-rw-r--r--  include/llvm/ADT/SmallVector.h | 8
-rw-r--r--  include/llvm/Analysis/OrderedBasicBlock.h | 1
-rw-r--r--  include/llvm/Analysis/RegionPass.h | 5
-rw-r--r--  include/llvm/Analysis/ScalarEvolution.h | 2
-rw-r--r--  include/llvm/CodeGen/MachineRegionInfo.h | 130
-rw-r--r--  include/llvm/CodeGen/MachineRegisterInfo.h | 98
-rw-r--r--  include/llvm/CodeGen/MachineScheduler.h | 11
-rw-r--r--  include/llvm/CodeGen/PBQP/CostAllocator.h | 61
-rw-r--r--  include/llvm/CodeGen/PBQP/Graph.h | 99
-rw-r--r--  include/llvm/CodeGen/PBQP/Math.h | 22
-rw-r--r--  include/llvm/CodeGen/PBQP/ReductionRules.h | 36
-rw-r--r--  include/llvm/CodeGen/PBQP/Solution.h | 2
-rw-r--r--  include/llvm/CodeGen/PBQPRAConstraint.h | 22
-rw-r--r--  include/llvm/CodeGen/Passes.h | 3
-rw-r--r--  include/llvm/CodeGen/RegAllocPBQP.h | 41
-rw-r--r--  include/llvm/CodeGen/RegisterScavenging.h | 4
-rw-r--r--  include/llvm/CodeGen/ScheduleDAGInstrs.h | 66
-rw-r--r--  include/llvm/CodeGen/SelectionDAG.h | 108
-rw-r--r--  include/llvm/CodeGen/SelectionDAGNodes.h | 50
-rw-r--r--  include/llvm/CodeGen/SlotIndexes.h | 28
-rw-r--r--  include/llvm/CodeGen/StackMaps.h | 31
-rw-r--r--  include/llvm/CodeGen/TargetSchedule.h | 3
-rw-r--r--  include/llvm/CodeGen/WinEHFuncInfo.h | 36
-rw-r--r--  include/llvm/DebugInfo/CodeView/CodeView.h | 8
-rw-r--r--  include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h | 1
-rw-r--r--  include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h | 4
-rw-r--r--  include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h | 17
-rw-r--r--  include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h | 21
-rw-r--r--  include/llvm/DebugInfo/CodeView/SymbolDeserializer.h | 16
-rw-r--r--  include/llvm/DebugInfo/CodeView/SymbolDumper.h | 5
-rw-r--r--  include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h | 9
-rw-r--r--  include/llvm/DebugInfo/CodeView/SymbolSerializer.h | 7
-rw-r--r--  include/llvm/DebugInfo/MSF/MappedBlockStream.h | 39
-rw-r--r--  include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h | 11
-rw-r--r--  include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h | 12
-rw-r--r--  include/llvm/DebugInfo/PDB/Native/PDBStringTable.h | 2
-rw-r--r--  include/llvm/DebugInfo/PDB/Native/TpiStream.h | 5
-rw-r--r--  include/llvm/IR/DIBuilder.h | 4
-rw-r--r--  include/llvm/IR/DebugLoc.h | 6
-rw-r--r--  include/llvm/IR/ModuleSummaryIndex.h | 44
-rw-r--r--  include/llvm/IR/ModuleSummaryIndexYAML.h | 23
-rw-r--r--  include/llvm/IR/Statepoint.h | 16
-rw-r--r--  include/llvm/InitializePasses.h | 12
-rw-r--r--  include/llvm/LTO/Config.h | 3
-rw-r--r--  include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h | 32
-rw-r--r--  include/llvm/ObjectYAML/CodeViewYAMLSymbols.h | 4
-rw-r--r--  include/llvm/TableGen/Record.h | 8
-rw-r--r--  include/llvm/Transforms/IPO/FunctionImport.h | 12
-rw-r--r--  include/llvm/Transforms/Instrumentation.h | 1
-rw-r--r--  include/llvm/Transforms/Utils/Cloning.h | 4
-rw-r--r--  lib/Analysis/ConstantFolding.cpp | 7
-rw-r--r--  lib/Analysis/IndirectCallPromotionAnalysis.cpp | 2
-rw-r--r--  lib/Analysis/InlineCost.cpp | 132
-rw-r--r--  lib/Analysis/LazyValueInfo.cpp | 69
-rw-r--r--  lib/Analysis/ModuleSummaryAnalysis.cpp | 12
-rw-r--r--  lib/Analysis/OrderedBasicBlock.cpp | 2
-rw-r--r--  lib/Analysis/RegionPass.cpp | 16
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.cpp | 6
-rw-r--r--  lib/Bitcode/Writer/BitcodeWriter.cpp | 113
-rw-r--r--  lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp | 4
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 1
-rw-r--r--  lib/CodeGen/GlobalMerge.cpp | 4
-rw-r--r--  lib/CodeGen/LivePhysRegs.cpp | 3
-rw-r--r--  lib/CodeGen/LiveRegUnits.cpp | 56
-rw-r--r--  lib/CodeGen/MachineRegionInfo.cpp | 46
-rw-r--r--  lib/CodeGen/MachineVerifier.cpp | 43
-rw-r--r--  lib/CodeGen/PrologEpilogInserter.cpp | 100
-rw-r--r--  lib/CodeGen/RegAllocBasic.cpp | 61
-rw-r--r--  lib/CodeGen/RegAllocPBQP.cpp | 40
-rw-r--r--  lib/CodeGen/RegisterScavenging.cpp | 126
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 80
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 1
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 121
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 46
-rw-r--r--  lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 36
-rw-r--r--  lib/CodeGen/TargetLoweringBase.cpp | 2
-rw-r--r--  lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 14
-rw-r--r--  lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp | 6
-rw-r--r--  lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp | 32
-rw-r--r--  lib/DebugInfo/CodeView/SymbolDumper.cpp | 4
-rw-r--r--  lib/DebugInfo/CodeView/SymbolRecordMapping.cpp | 1
-rw-r--r--  lib/DebugInfo/CodeView/SymbolSerializer.cpp | 8
-rw-r--r--  lib/DebugInfo/MSF/MappedBlockStream.cpp | 63
-rw-r--r--  lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp | 54
-rw-r--r--  lib/DebugInfo/PDB/Native/DbiStream.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 6
-rw-r--r--  lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp | 28
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBFile.cpp | 9
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 8
-rw-r--r--  lib/DebugInfo/PDB/Native/PDBStringTable.cpp | 4
-rw-r--r--  lib/DebugInfo/PDB/Native/TpiStream.cpp | 6
-rw-r--r--  lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 8
-rw-r--r--  lib/IR/DIBuilder.cpp | 33
-rw-r--r--  lib/IR/DebugLoc.cpp | 81
-rw-r--r--  lib/IR/OptBisect.cpp | 15
-rw-r--r--  lib/LTO/LTO.cpp | 43
-rw-r--r--  lib/LTO/LTOBackend.cpp | 7
-rw-r--r--  lib/LTO/ThinLTOCodeGenerator.cpp | 16
-rw-r--r--  lib/MC/WasmObjectWriter.cpp | 906
-rw-r--r--  lib/ObjectYAML/CodeViewYAMLDebugSections.cpp | 407
-rw-r--r--  lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 12
-rw-r--r--  lib/Passes/PassBuilder.cpp | 6
-rw-r--r--  lib/Support/Triple.cpp | 6
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.h | 7
-rw-r--r--  lib/Target/AArch64/AArch64SchedFalkorDetails.td | 41
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 7
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 14
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaits.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SMInstructions.td | 1
-rw-r--r--  lib/Target/AMDGPU/VOP3Instructions.td | 3
-rw-r--r--  lib/Target/ARM/ARM.td | 21
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 83
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 18
-rw-r--r--  lib/Target/ARM/ARMCallLowering.cpp | 14
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 4
-rw-r--r--  lib/Target/ARM/ARMScheduleA57.td | 1471
-rw-r--r--  lib/Target/ARM/ARMScheduleA57WriteRes.td | 323
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 5
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.cpp | 4
-rw-r--r--  lib/Target/Mips/MicroMipsSizeReduction.cpp | 57
-rw-r--r--  lib/Target/WebAssembly/known_gcc_test_failures.txt | 3
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 135
-rw-r--r--  lib/Transforms/Coroutines/CoroSplit.cpp | 2
-rw-r--r--  lib/Transforms/Coroutines/Coroutines.cpp | 12
-rw-r--r--  lib/Transforms/IPO/FunctionAttrs.cpp | 4
-rw-r--r--  lib/Transforms/IPO/FunctionImport.cpp | 107
-rw-r--r--  lib/Transforms/IPO/LowerTypeTests.cpp | 3
-rw-r--r--  lib/Transforms/IPO/PartialInlining.cpp | 66
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp | 2
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp | 11
-rw-r--r--  lib/Transforms/Instrumentation/MemorySanitizer.cpp | 1
-rw-r--r--  lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 181
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 1
-rw-r--r--  lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 162
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 10
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp | 7
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp | 71
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 2
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 151
-rw-r--r--  runtimes/CMakeLists.txt | 2
-rw-r--r--  test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir | 9
-rw-r--r--  test/CodeGen/AMDGPU/basic-branch.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/branch-condition-and.ll | 3
-rw-r--r--  test/CodeGen/AMDGPU/branch-relaxation.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/commute-compares.ll | 8
-rw-r--r--  test/CodeGen/AMDGPU/control-flow-fastregalloc.ll | 11
-rw-r--r--  test/CodeGen/AMDGPU/indirect-addressing-si.ll | 9
-rw-r--r--  test/CodeGen/AMDGPU/infinite-loop.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 10
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll | 1
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.image.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 9
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 4
-rw-r--r--  test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 1
-rw-r--r--  test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll | 15
-rw-r--r--  test/CodeGen/AMDGPU/ret_jump.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll | 3
-rw-r--r--  test/CodeGen/AMDGPU/smrd-vccz-bug.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/spill-m0.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/sub.i16.ll | 2
-rw-r--r--  test/CodeGen/AMDGPU/valu-i1.ll | 8
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 62
-rw-r--r--  test/CodeGen/ARM/GlobalISel/arm-unsupported.ll | 13
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-alu.ll | 81
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-basic.ll | 53
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll | 37
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-ldm.ll | 28
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll | 36
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-stm.ll | 29
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vfma.ll | 77
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll | 50
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vldm.ll | 30
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll | 43
-rw-r--r--  test/CodeGen/ARM/cortex-a57-misched-vstm.ll | 23
-rw-r--r--  test/CodeGen/ARM/global-merge-external.ll | 1
-rw-r--r--  test/CodeGen/Hexagon/newify-crash.ll | 44
-rw-r--r--  test/CodeGen/MIR/Generic/runPass.mir | 1
-rw-r--r--  test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll | 40
-rw-r--r--  test/CodeGen/PowerPC/scavenging.mir | 149
-rw-r--r--  test/CodeGen/SystemZ/RAbasic-invalid-LR-update.mir | 267
-rw-r--r--  test/CodeGen/X86/and-sink.ll | 133
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll | 408
-rw-r--r--  test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll | 32
-rw-r--r--  test/CodeGen/X86/clear_upper_vector_element_bits.ll | 44
-rw-r--r--  test/CodeGen/X86/scavenger.mir | 54
-rw-r--r--  test/CodeGen/X86/select.ll | 31
-rw-r--r--  test/CodeGen/X86/shrink-compare.ll | 129
-rw-r--r--  test/CodeGen/X86/sse3.ll | 18
-rw-r--r--  test/CodeGen/X86/stack-folding-fp-avx1.ll | 14
-rw-r--r--  test/CodeGen/X86/statepoint-allocas.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-call-lowering.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-far-call.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-forward.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-gctransition-call-lowering.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-invoke.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-live-in.ll | 124
-rw-r--r--  test/CodeGen/X86/statepoint-stack-usage.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-stackmap-format.ll | 4
-rw-r--r--  test/CodeGen/X86/statepoint-uniqueing.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-vector-bad-spill.ll | 2
-rw-r--r--  test/CodeGen/X86/statepoint-vector.ll | 2
-rw-r--r--  test/CodeGen/X86/vector-unsigned-cmp.ll | 519
-rw-r--r--  test/CodeGen/X86/wide-fma-contraction.ll | 54
-rw-r--r--  test/CodeGen/X86/xor-icmp.ll | 78
-rw-r--r--  test/DebugInfo/MIR/AArch64/clobber-sp.mir | 181
-rw-r--r--  test/DebugInfo/MIR/AArch64/lit.local.cfg | 3
-rw-r--r--  test/DebugInfo/PDB/Inputs/simple-line-info.yaml | 71
-rw-r--r--  test/DebugInfo/PDB/pdbdump-write.test | 6
-rw-r--r--  test/DebugInfo/PDB/pdbdump-yaml-lineinfo.test | 13
-rw-r--r--  test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll | 13
-rw-r--r--  test/MC/WebAssembly/external-data.ll | 21
-rw-r--r--  test/ThinLTO/X86/deadstrip.ll | 16
-rw-r--r--  test/ThinLTO/X86/newpm-basic.ll | 2
-rw-r--r--  test/Transforms/CodeExtractor/cost.ll | 64
-rw-r--r--  test/Transforms/CodeExtractor/cost_meta.ll | 41
-rw-r--r--  test/Transforms/Coroutines/coro-split-02.ll | 5
-rw-r--r--  test/Transforms/Inline/AArch64/switch.ll | 4
-rw-r--r--  test/Transforms/InstCombine/not.ll | 49
-rw-r--r--  test/Transforms/InstSimplify/compare.ll | 16
-rw-r--r--  test/Transforms/LowerExpectIntrinsic/phi_merge.ll | 356
-rw-r--r--  test/Transforms/LowerExpectIntrinsic/phi_or.ll | 103
-rw-r--r--  test/Transforms/LowerExpectIntrinsic/phi_tern.ll | 56
-rw-r--r--  test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml | 3
-rw-r--r--  test/Transforms/LowerTypeTests/Inputs/use-typeid1-dead.yaml | 7
-rw-r--r--  test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml | 3
-rw-r--r--  test/Transforms/LowerTypeTests/export-dead.ll | 14
-rw-r--r--  test/Transforms/LowerTypeTests/export-nothing.ll | 1
-rw-r--r--  test/Transforms/LowerTypeTests/import-unsat.ll | 5
-rw-r--r--  test/Transforms/SROA/address-spaces.ll | 18
-rw-r--r--  test/Transforms/Util/PredicateInfo/condprop.ll | 1
-rw-r--r--  test/Transforms/Util/PredicateInfo/condprop2.ll | 474
-rw-r--r--  test/Transforms/Util/PredicateInfo/testandor.ll | 1
-rw-r--r--  test/Transforms/Util/PredicateInfo/testandor2.ll | 214
-rw-r--r--  test/Transforms/WholeProgramDevirt/Inputs/export.yaml | 3
-rw-r--r--  test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml | 3
-rw-r--r--  test/Transforms/WholeProgramDevirt/export-nothing.ll | 1
-rw-r--r--  test/Transforms/WholeProgramDevirt/export-single-impl.ll | 1
-rw-r--r--  test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll | 3
-rw-r--r--  test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll | 3
-rw-r--r--  test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll | 2
-rw-r--r--  test/Transforms/WholeProgramDevirt/import-indir.ll | 4
-rw-r--r--  test/tools/llvm-lto2/X86/pipeline.ll | 2
-rw-r--r--  tools/llc/llc.cpp | 3
-rw-r--r--  tools/llvm-config/llvm-config.cpp | 2
-rw-r--r--  tools/llvm-lto2/llvm-lto2.cpp | 8
-rw-r--r--  tools/llvm-pdbdump/LLVMOutputStyle.cpp | 14
-rw-r--r--  tools/llvm-pdbdump/PdbYaml.cpp | 4
-rw-r--r--  tools/llvm-pdbdump/PdbYaml.h | 5
-rw-r--r--  tools/llvm-pdbdump/YAMLOutputStyle.cpp | 143
-rw-r--r--  tools/llvm-pdbdump/YAMLOutputStyle.h | 3
-rw-r--r--  tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp | 2
-rw-r--r--  tools/llvm-pdbdump/llvm-pdbdump.cpp | 68
-rw-r--r--  tools/llvm-readobj/COFFDumper.cpp | 3
-rw-r--r--  unittests/ADT/SmallVectorTest.cpp | 10
-rw-r--r--  unittests/Analysis/CMakeLists.txt | 5
-rw-r--r--  unittests/Analysis/OrderedBasicBlockTest.cpp | 58
-rw-r--r--  unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp | 105
-rw-r--r--  unittests/Transforms/Utils/Cloning.cpp | 2
-rw-r--r--  utils/TableGen/X86FoldTablesEmitter.cpp | 5
-rw-r--r--  utils/lit/lit/util.py | 14
277 files changed, 9435 insertions(+), 2819 deletions(-)
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index bd24eab93b50..35c255002001 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -415,6 +415,9 @@ public:
append(IL.begin(), IL.end());
}
+ // FIXME: Consider assigning over existing elements, rather than clearing &
+ // re-initializing them - for all assign(...) variants.
+
void assign(size_type NumElts, const T &Elt) {
clear();
if (this->capacity() < NumElts)
@@ -423,6 +426,11 @@ public:
std::uninitialized_fill(this->begin(), this->end(), Elt);
}
+ template <typename in_iter> void assign(in_iter in_start, in_iter in_end) {
+ clear();
+ append(in_start, in_end);
+ }
+
void assign(std::initializer_list<T> IL) {
clear();
append(IL);
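Note: the new iterator-range assign() clears the vector and then appends the range, matching the other assign(...) variants. A minimal usage sketch (names here are illustrative, not from the patch):

    #include "llvm/ADT/SmallVector.h"
    #include <vector>

    void copyInto(llvm::SmallVectorImpl<int> &SV) {
      std::vector<int> Src = {1, 2, 3};
      SV.assign(Src.begin(), Src.end()); // clears SV, then appends [begin, end)
    }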
diff --git a/include/llvm/Analysis/OrderedBasicBlock.h b/include/llvm/Analysis/OrderedBasicBlock.h
index 5aa813eb4832..2e716af1f60d 100644
--- a/include/llvm/Analysis/OrderedBasicBlock.h
+++ b/include/llvm/Analysis/OrderedBasicBlock.h
@@ -58,6 +58,7 @@ public:
/// comes before \p B in \p BB. This is a simplification that considers
/// cached instruction positions and ignores other basic blocks, being
/// only relevant to compare relative instructions positions inside \p BB.
+ /// Returns false for A == B.
bool dominates(const Instruction *A, const Instruction *B);
};
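Note: the added sentence pins down that the query is strict. A sketch of the documented behavior (BB and I are placeholders):

    void check(const llvm::BasicBlock *BB, const llvm::Instruction *I) {
      llvm::OrderedBasicBlock OBB(BB);
      bool Strict = OBB.dominates(I, I); // false: an instruction does not
      (void)Strict;                      // dominate itself in this query
    }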
diff --git a/include/llvm/Analysis/RegionPass.h b/include/llvm/Analysis/RegionPass.h
index b5f38139abf2..515b362e5407 100644
--- a/include/llvm/Analysis/RegionPass.h
+++ b/include/llvm/Analysis/RegionPass.h
@@ -78,6 +78,11 @@ public:
return PMT_RegionPassManager;
}
//@}
+
+protected:
+ /// Optional passes call this function to check whether the pass should be
+ /// skipped. This is the case when optimization bisect is over the limit.
+ bool skipRegion(Region &R) const;
};
/// @brief The pass manager to schedule RegionPasses.
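Note: a sketch of how an optional region pass would consume the new hook, following the skipFunction()/skipBasicBlock() pattern used by other pass kinds (MyRegionPass is hypothetical):

    bool MyRegionPass::runOnRegion(Region *R, RGPassManager &RGM) {
      if (skipRegion(*R)) // honor opt-bisect limits
        return false;     // region left unmodified
      // ... transform the region ...
      return true;
    }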
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 1d715b590ab7..8ee9712b93d8 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -636,7 +636,7 @@ private:
/// @}
public:
- BackedgeTakenInfo() : MaxAndComplete(nullptr, 0) {}
+ BackedgeTakenInfo() : MaxAndComplete(nullptr, 0), MaxOrZero(false) {}
BackedgeTakenInfo(BackedgeTakenInfo &&) = default;
BackedgeTakenInfo &operator=(BackedgeTakenInfo &&) = default;
diff --git a/include/llvm/CodeGen/MachineRegionInfo.h b/include/llvm/CodeGen/MachineRegionInfo.h
index 21f847c7e5ba..8394b58d0a16 100644
--- a/include/llvm/CodeGen/MachineRegionInfo.h
+++ b/include/llvm/CodeGen/MachineRegionInfo.h
@@ -10,83 +10,77 @@
#ifndef LLVM_CODEGEN_MACHINEREGIONINFO_H
#define LLVM_CODEGEN_MACHINEREGIONINFO_H
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-
+#include <cassert>
namespace llvm {
-class MachineDominatorTree;
struct MachinePostDominatorTree;
class MachineRegion;
class MachineRegionNode;
class MachineRegionInfo;
-template<>
-struct RegionTraits<MachineFunction> {
- typedef MachineFunction FuncT;
- typedef MachineBasicBlock BlockT;
- typedef MachineRegion RegionT;
- typedef MachineRegionNode RegionNodeT;
- typedef MachineRegionInfo RegionInfoT;
- typedef MachineDominatorTree DomTreeT;
- typedef MachineDomTreeNode DomTreeNodeT;
- typedef MachinePostDominatorTree PostDomTreeT;
- typedef MachineDominanceFrontier DomFrontierT;
- typedef MachineInstr InstT;
- typedef MachineLoop LoopT;
- typedef MachineLoopInfo LoopInfoT;
+template <> struct RegionTraits<MachineFunction> {
+ using FuncT = MachineFunction;
+ using BlockT = MachineBasicBlock;
+ using RegionT = MachineRegion;
+ using RegionNodeT = MachineRegionNode;
+ using RegionInfoT = MachineRegionInfo;
+ using DomTreeT = MachineDominatorTree;
+ using DomTreeNodeT = MachineDomTreeNode;
+ using PostDomTreeT = MachinePostDominatorTree;
+ using DomFrontierT = MachineDominanceFrontier;
+ using InstT = MachineInstr;
+ using LoopT = MachineLoop;
+ using LoopInfoT = MachineLoopInfo;
static unsigned getNumSuccessors(MachineBasicBlock *BB) {
return BB->succ_size();
}
};
-
class MachineRegionNode : public RegionNodeBase<RegionTraits<MachineFunction>> {
public:
- inline MachineRegionNode(MachineRegion *Parent,
- MachineBasicBlock *Entry,
+ inline MachineRegionNode(MachineRegion *Parent, MachineBasicBlock *Entry,
bool isSubRegion = false)
- : RegionNodeBase<RegionTraits<MachineFunction>>(Parent, Entry, isSubRegion) {
-
- }
+ : RegionNodeBase<RegionTraits<MachineFunction>>(Parent, Entry,
+ isSubRegion) {}
bool operator==(const MachineRegion &RN) const {
- return this == reinterpret_cast<const MachineRegionNode*>(&RN);
+ return this == reinterpret_cast<const MachineRegionNode *>(&RN);
}
};
class MachineRegion : public RegionBase<RegionTraits<MachineFunction>> {
public:
MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
- MachineRegionInfo* RI,
- MachineDominatorTree *DT, MachineRegion *Parent = nullptr);
+ MachineRegionInfo *RI, MachineDominatorTree *DT,
+ MachineRegion *Parent = nullptr);
~MachineRegion();
bool operator==(const MachineRegionNode &RN) const {
- return &RN == reinterpret_cast<const MachineRegionNode*>(this);
+ return &RN == reinterpret_cast<const MachineRegionNode *>(this);
}
};
class MachineRegionInfo : public RegionInfoBase<RegionTraits<MachineFunction>> {
public:
explicit MachineRegionInfo();
-
~MachineRegionInfo() override;
// updateStatistics - Update statistic about created regions.
void updateStatistics(MachineRegion *R) final;
- void recalculate(MachineFunction &F,
- MachineDominatorTree *DT,
- MachinePostDominatorTree *PDT,
- MachineDominanceFrontier *DF);
+ void recalculate(MachineFunction &F, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT, MachineDominanceFrontier *DF);
};
class MachineRegionInfoPass : public MachineFunctionPass {
@@ -94,17 +88,13 @@ class MachineRegionInfoPass : public MachineFunctionPass {
public:
static char ID;
- explicit MachineRegionInfoPass();
+ explicit MachineRegionInfoPass();
~MachineRegionInfoPass() override;
- MachineRegionInfo &getRegionInfo() {
- return RI;
- }
+ MachineRegionInfo &getRegionInfo() { return RI; }
- const MachineRegionInfo &getRegionInfo() const {
- return RI;
- }
+ const MachineRegionInfo &getRegionInfo() const { return RI; }
/// @name MachineFunctionPass interface
//@{
@@ -117,66 +107,76 @@ public:
//@}
};
-
template <>
template <>
-inline MachineBasicBlock* RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineBasicBlock>() const {
+inline MachineBasicBlock *
+RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineBasicBlock>()
+ const {
assert(!isSubRegion() && "This is not a MachineBasicBlock RegionNode!");
return getEntry();
}
-template<>
-template<>
-inline MachineRegion* RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineRegion>() const {
+template <>
+template <>
+inline MachineRegion *
+RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineRegion>()
+ const {
assert(isSubRegion() && "This is not a subregion RegionNode!");
- auto Unconst = const_cast<RegionNodeBase<RegionTraits<MachineFunction>>*>(this);
- return reinterpret_cast<MachineRegion*>(Unconst);
+ auto Unconst =
+ const_cast<RegionNodeBase<RegionTraits<MachineFunction>> *>(this);
+ return reinterpret_cast<MachineRegion *>(Unconst);
}
-
RegionNodeGraphTraits(MachineRegionNode, MachineBasicBlock, MachineRegion);
-RegionNodeGraphTraits(const MachineRegionNode, MachineBasicBlock, MachineRegion);
+RegionNodeGraphTraits(const MachineRegionNode, MachineBasicBlock,
+ MachineRegion);
RegionGraphTraits(MachineRegion, MachineRegionNode);
RegionGraphTraits(const MachineRegion, const MachineRegionNode);
-template <> struct GraphTraits<MachineRegionInfo*>
- : public GraphTraits<FlatIt<MachineRegionNode*> > {
- typedef df_iterator<NodeRef, df_iterator_default_set<NodeRef>, false,
- GraphTraits<FlatIt<NodeRef>>>
- nodes_iterator;
+template <>
+struct GraphTraits<MachineRegionInfo *>
+ : public GraphTraits<FlatIt<MachineRegionNode *>> {
+ using nodes_iterator = df_iterator<NodeRef, df_iterator_default_set<NodeRef>,
+ false, GraphTraits<FlatIt<NodeRef>>>;
static NodeRef getEntryNode(MachineRegionInfo *RI) {
- return GraphTraits<FlatIt<MachineRegion*> >::getEntryNode(RI->getTopLevelRegion());
+ return GraphTraits<FlatIt<MachineRegion *>>::getEntryNode(
+ RI->getTopLevelRegion());
}
- static nodes_iterator nodes_begin(MachineRegionInfo* RI) {
+
+ static nodes_iterator nodes_begin(MachineRegionInfo *RI) {
return nodes_iterator::begin(getEntryNode(RI));
}
+
static nodes_iterator nodes_end(MachineRegionInfo *RI) {
return nodes_iterator::end(getEntryNode(RI));
}
};
-template <> struct GraphTraits<MachineRegionInfoPass*>
- : public GraphTraits<MachineRegionInfo *> {
- typedef df_iterator<NodeRef, df_iterator_default_set<NodeRef>, false,
- GraphTraits<FlatIt<NodeRef>>>
- nodes_iterator;
+template <>
+struct GraphTraits<MachineRegionInfoPass *>
+ : public GraphTraits<MachineRegionInfo *> {
+ using nodes_iterator = df_iterator<NodeRef, df_iterator_default_set<NodeRef>,
+ false, GraphTraits<FlatIt<NodeRef>>>;
static NodeRef getEntryNode(MachineRegionInfoPass *RI) {
- return GraphTraits<MachineRegionInfo*>::getEntryNode(&RI->getRegionInfo());
+ return GraphTraits<MachineRegionInfo *>::getEntryNode(&RI->getRegionInfo());
}
- static nodes_iterator nodes_begin(MachineRegionInfoPass* RI) {
- return GraphTraits<MachineRegionInfo*>::nodes_begin(&RI->getRegionInfo());
+
+ static nodes_iterator nodes_begin(MachineRegionInfoPass *RI) {
+ return GraphTraits<MachineRegionInfo *>::nodes_begin(&RI->getRegionInfo());
}
+
static nodes_iterator nodes_end(MachineRegionInfoPass *RI) {
- return GraphTraits<MachineRegionInfo*>::nodes_end(&RI->getRegionInfo());
+ return GraphTraits<MachineRegionInfo *>::nodes_end(&RI->getRegionInfo());
}
};
extern template class RegionBase<RegionTraits<MachineFunction>>;
extern template class RegionNodeBase<RegionTraits<MachineFunction>>;
extern template class RegionInfoBase<RegionTraits<MachineFunction>>;
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_MACHINEREGIONINFO_H
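Note: most of this file's churn is the mechanical typedef-to-using conversion. The two spellings are equivalent, but only the alias form generalizes to templates (illustrative only):

    typedef MachineFunction FuncT;   // old spelling
    using FuncT = MachineFunction;   // new spelling, same meaning
    template <typename T>
    using Vec = SmallVector<T, 4>;   // alias template: no typedef equivalent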
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 1026654da3d7..c027783aae55 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -14,11 +14,13 @@
#ifndef LLVM_CODEGEN_MACHINEREGISTERINFO_H
#define LLVM_CODEGEN_MACHINEREGISTERINFO_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -41,8 +43,8 @@ namespace llvm {
class PSetIterator;
/// Convenient type to represent either a register class or a register bank.
-typedef PointerUnion<const TargetRegisterClass *, const RegisterBank *>
- RegClassOrRegBank;
+using RegClassOrRegBank =
+ PointerUnion<const TargetRegisterClass *, const RegisterBank *>;
/// MachineRegisterInfo - Keep track of information for virtual and physical
/// registers, including vreg register classes, use/def chains for registers,
@@ -125,7 +127,7 @@ private:
/// started.
BitVector ReservedRegs;
- typedef DenseMap<unsigned, LLT> VRegToTypeMap;
+ using VRegToTypeMap = DenseMap<unsigned, LLT>;
/// Map generic virtual registers to their actual size.
mutable std::unique_ptr<VRegToTypeMap> VRegToType;
@@ -266,8 +268,8 @@ public:
/// reg_iterator/reg_begin/reg_end - Walk all defs and uses of the specified
/// register.
- typedef defusechain_iterator<true,true,false,true,false,false>
- reg_iterator;
+ using reg_iterator =
+ defusechain_iterator<true, true, false, true, false, false>;
reg_iterator reg_begin(unsigned RegNo) const {
return reg_iterator(getRegUseDefListHead(RegNo));
}
@@ -279,8 +281,8 @@ public:
/// reg_instr_iterator/reg_instr_begin/reg_instr_end - Walk all defs and uses
/// of the specified register, stepping by MachineInstr.
- typedef defusechain_instr_iterator<true,true,false,false,true,false>
- reg_instr_iterator;
+ using reg_instr_iterator =
+ defusechain_instr_iterator<true, true, false, false, true, false>;
reg_instr_iterator reg_instr_begin(unsigned RegNo) const {
return reg_instr_iterator(getRegUseDefListHead(RegNo));
}
@@ -295,8 +297,8 @@ public:
/// reg_bundle_iterator/reg_bundle_begin/reg_bundle_end - Walk all defs and uses
/// of the specified register, stepping by bundle.
- typedef defusechain_instr_iterator<true,true,false,false,false,true>
- reg_bundle_iterator;
+ using reg_bundle_iterator =
+ defusechain_instr_iterator<true, true, false, false, false, true>;
reg_bundle_iterator reg_bundle_begin(unsigned RegNo) const {
return reg_bundle_iterator(getRegUseDefListHead(RegNo));
}
@@ -314,8 +316,8 @@ public:
/// reg_nodbg_iterator/reg_nodbg_begin/reg_nodbg_end - Walk all defs and uses
/// of the specified register, skipping those marked as Debug.
- typedef defusechain_iterator<true,true,true,true,false,false>
- reg_nodbg_iterator;
+ using reg_nodbg_iterator =
+ defusechain_iterator<true, true, true, true, false, false>;
reg_nodbg_iterator reg_nodbg_begin(unsigned RegNo) const {
return reg_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -331,8 +333,8 @@ public:
/// reg_instr_nodbg_iterator/reg_instr_nodbg_begin/reg_instr_nodbg_end - Walk
/// all defs and uses of the specified register, stepping by MachineInstr,
/// skipping those marked as Debug.
- typedef defusechain_instr_iterator<true,true,true,false,true,false>
- reg_instr_nodbg_iterator;
+ using reg_instr_nodbg_iterator =
+ defusechain_instr_iterator<true, true, true, false, true, false>;
reg_instr_nodbg_iterator reg_instr_nodbg_begin(unsigned RegNo) const {
return reg_instr_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -348,8 +350,8 @@ public:
/// reg_bundle_nodbg_iterator/reg_bundle_nodbg_begin/reg_bundle_nodbg_end - Walk
/// all defs and uses of the specified register, stepping by bundle,
/// skipping those marked as Debug.
- typedef defusechain_instr_iterator<true,true,true,false,false,true>
- reg_bundle_nodbg_iterator;
+ using reg_bundle_nodbg_iterator =
+ defusechain_instr_iterator<true, true, true, false, false, true>;
reg_bundle_nodbg_iterator reg_bundle_nodbg_begin(unsigned RegNo) const {
return reg_bundle_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -369,8 +371,8 @@ public:
}
/// def_iterator/def_begin/def_end - Walk all defs of the specified register.
- typedef defusechain_iterator<false,true,false,true,false,false>
- def_iterator;
+ using def_iterator =
+ defusechain_iterator<false, true, false, true, false, false>;
def_iterator def_begin(unsigned RegNo) const {
return def_iterator(getRegUseDefListHead(RegNo));
}
@@ -382,8 +384,8 @@ public:
/// def_instr_iterator/def_instr_begin/def_instr_end - Walk all defs of the
/// specified register, stepping by MachineInst.
- typedef defusechain_instr_iterator<false,true,false,false,true,false>
- def_instr_iterator;
+ using def_instr_iterator =
+ defusechain_instr_iterator<false, true, false, false, true, false>;
def_instr_iterator def_instr_begin(unsigned RegNo) const {
return def_instr_iterator(getRegUseDefListHead(RegNo));
}
@@ -398,8 +400,8 @@ public:
/// def_bundle_iterator/def_bundle_begin/def_bundle_end - Walk all defs of the
/// specified register, stepping by bundle.
- typedef defusechain_instr_iterator<false,true,false,false,false,true>
- def_bundle_iterator;
+ using def_bundle_iterator =
+ defusechain_instr_iterator<false, true, false, false, false, true>;
def_bundle_iterator def_bundle_begin(unsigned RegNo) const {
return def_bundle_iterator(getRegUseDefListHead(RegNo));
}
@@ -425,8 +427,8 @@ public:
}
/// use_iterator/use_begin/use_end - Walk all uses of the specified register.
- typedef defusechain_iterator<true,false,false,true,false,false>
- use_iterator;
+ using use_iterator =
+ defusechain_iterator<true, false, false, true, false, false>;
use_iterator use_begin(unsigned RegNo) const {
return use_iterator(getRegUseDefListHead(RegNo));
}
@@ -438,8 +440,8 @@ public:
/// use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the
/// specified register, stepping by MachineInstr.
- typedef defusechain_instr_iterator<true,false,false,false,true,false>
- use_instr_iterator;
+ using use_instr_iterator =
+ defusechain_instr_iterator<true, false, false, false, true, false>;
use_instr_iterator use_instr_begin(unsigned RegNo) const {
return use_instr_iterator(getRegUseDefListHead(RegNo));
}
@@ -454,8 +456,8 @@ public:
/// use_bundle_iterator/use_bundle_begin/use_bundle_end - Walk all uses of the
/// specified register, stepping by bundle.
- typedef defusechain_instr_iterator<true,false,false,false,false,true>
- use_bundle_iterator;
+ using use_bundle_iterator =
+ defusechain_instr_iterator<true, false, false, false, false, true>;
use_bundle_iterator use_bundle_begin(unsigned RegNo) const {
return use_bundle_iterator(getRegUseDefListHead(RegNo));
}
@@ -482,8 +484,8 @@ public:
/// use_nodbg_iterator/use_nodbg_begin/use_nodbg_end - Walk all uses of the
/// specified register, skipping those marked as Debug.
- typedef defusechain_iterator<true,false,true,true,false,false>
- use_nodbg_iterator;
+ using use_nodbg_iterator =
+ defusechain_iterator<true, false, true, true, false, false>;
use_nodbg_iterator use_nodbg_begin(unsigned RegNo) const {
return use_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -499,8 +501,8 @@ public:
/// use_instr_nodbg_iterator/use_instr_nodbg_begin/use_instr_nodbg_end - Walk
/// all uses of the specified register, stepping by MachineInstr, skipping
/// those marked as Debug.
- typedef defusechain_instr_iterator<true,false,true,false,true,false>
- use_instr_nodbg_iterator;
+ using use_instr_nodbg_iterator =
+ defusechain_instr_iterator<true, false, true, false, true, false>;
use_instr_nodbg_iterator use_instr_nodbg_begin(unsigned RegNo) const {
return use_instr_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -516,8 +518,8 @@ public:
/// use_bundle_nodbg_iterator/use_bundle_nodbg_begin/use_bundle_nodbg_end - Walk
/// all uses of the specified register, stepping by bundle, skipping
/// those marked as Debug.
- typedef defusechain_instr_iterator<true,false,true,false,false,true>
- use_bundle_nodbg_iterator;
+ using use_bundle_nodbg_iterator =
+ defusechain_instr_iterator<true, false, true, false, false, true>;
use_bundle_nodbg_iterator use_bundle_nodbg_begin(unsigned RegNo) const {
return use_bundle_nodbg_iterator(getRegUseDefListHead(RegNo));
}
@@ -593,7 +595,6 @@ public:
/// Return the register class of the specified virtual register.
/// This shouldn't be used directly unless \p Reg has a register class.
/// \see getRegClassOrNull when this might happen.
- ///
const TargetRegisterClass *getRegClass(unsigned Reg) const {
assert(VRegInfo[Reg].first.is<const TargetRegisterClass *>() &&
"Register class not set, wrong accessor");
@@ -620,7 +621,6 @@ public:
/// a register bank or has been assigned a register class.
/// \note It is possible to get the register bank from the register class via
/// RegisterBankInfo::getRegBankFromRegClass.
- ///
const RegisterBank *getRegBankOrNull(unsigned Reg) const {
const RegClassOrRegBank &Val = VRegInfo[Reg].first;
return Val.dyn_cast<const RegisterBank *>();
@@ -629,17 +629,14 @@ public:
/// Return the register bank or register class of \p Reg.
/// \note Before the register bank gets assigned (i.e., before the
/// RegBankSelect pass) \p Reg may not have either.
- ///
const RegClassOrRegBank &getRegClassOrRegBank(unsigned Reg) const {
return VRegInfo[Reg].first;
}
/// setRegClass - Set the register class of the specified virtual register.
- ///
void setRegClass(unsigned Reg, const TargetRegisterClass *RC);
/// Set the register bank to \p RegBank for \p Reg.
- ///
void setRegBank(unsigned Reg, const RegisterBank &RegBank);
void setRegClassOrRegBank(unsigned Reg,
@@ -653,7 +650,6 @@ public:
/// new register class, or NULL if no such class exists.
/// This should only be used when the constraint is known to be trivial, like
/// GR32 -> GR32_NOSP. Beware of increasing register pressure.
- ///
const TargetRegisterClass *constrainRegClass(unsigned Reg,
const TargetRegisterClass *RC,
unsigned MinNumRegs = 0);
@@ -665,12 +661,10 @@ public:
/// This method can be used after constraints have been removed from a
/// virtual register, for example after removing instructions or splitting
/// the live range.
- ///
bool recomputeRegClass(unsigned Reg);
/// createVirtualRegister - Create and return a new virtual register in the
/// function with the specified register class.
- ///
unsigned createVirtualRegister(const TargetRegisterClass *RegClass);
/// Accessor for VRegToType. This accessor should only be used
@@ -704,7 +698,6 @@ public:
unsigned createIncompleteVirtualRegister();
/// getNumVirtRegs - Return the number of virtual registers created.
- ///
unsigned getNumVirtRegs() const { return VRegInfo.size(); }
/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
@@ -810,7 +803,6 @@ public:
///
/// Reserved registers may belong to an allocatable register class, but the
/// target has explicitly requested that they are not used.
- ///
bool isReserved(unsigned PhysReg) const {
return getReservedRegs().test(PhysReg);
}
@@ -838,8 +830,8 @@ public:
// Iteration support for the live-ins set. It's kept in sorted order
// by register number.
- typedef std::vector<std::pair<unsigned,unsigned>>::const_iterator
- livein_iterator;
+ using livein_iterator =
+ std::vector<std::pair<unsigned,unsigned>>::const_iterator;
livein_iterator livein_begin() const { return LiveIns.begin(); }
livein_iterator livein_end() const { return LiveIns.end(); }
bool livein_empty() const { return LiveIns.empty(); }
@@ -910,10 +902,10 @@ public:
}
public:
- typedef std::iterator<std::forward_iterator_tag,
- MachineInstr, ptrdiff_t>::reference reference;
- typedef std::iterator<std::forward_iterator_tag,
- MachineInstr, ptrdiff_t>::pointer pointer;
+ using reference = std::iterator<std::forward_iterator_tag,
+ MachineInstr, ptrdiff_t>::reference;
+ using pointer = std::iterator<std::forward_iterator_tag,
+ MachineInstr, ptrdiff_t>::pointer;
defusechain_iterator() = default;
@@ -1016,10 +1008,10 @@ public:
}
public:
- typedef std::iterator<std::forward_iterator_tag,
- MachineInstr, ptrdiff_t>::reference reference;
- typedef std::iterator<std::forward_iterator_tag,
- MachineInstr, ptrdiff_t>::pointer pointer;
+ using reference = std::iterator<std::forward_iterator_tag,
+ MachineInstr, ptrdiff_t>::reference;
+ using pointer = std::iterator<std::forward_iterator_tag,
+ MachineInstr, ptrdiff_t>::pointer;
defusechain_instr_iterator() = default;
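Note: a sketch of the renamed iterator types in use, walking the non-debug uses of a register (MRI and Reg are assumed to be in scope):

    void visitUses(llvm::MachineRegisterInfo &MRI, unsigned Reg) {
      for (llvm::MachineRegisterInfo::use_nodbg_iterator
               I = MRI.use_nodbg_begin(Reg),
               E = MRI.use_nodbg_end();
           I != E; ++I) {
        llvm::MachineOperand &MO = *I; // one visit per use operand
        (void)MO;
      }
    }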
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 6b2a16e1d36e..3b02ec400aba 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -104,10 +104,15 @@ extern cl::opt<bool> ForceBottomUp;
class LiveIntervals;
class MachineDominatorTree;
+class MachineFunction;
+class MachineInstr;
class MachineLoopInfo;
class RegisterClassInfo;
class SchedDFSResult;
class ScheduleHazardRecognizer;
+class TargetInstrInfo;
+class TargetPassConfig;
+class TargetRegisterInfo;
/// MachineSchedContext provides enough context from the MachineScheduler pass
/// for the target to instantiate a scheduler.
@@ -129,10 +134,10 @@ struct MachineSchedContext {
/// schedulers.
class MachineSchedRegistry : public MachinePassRegistryNode {
public:
- typedef ScheduleDAGInstrs *(*ScheduleDAGCtor)(MachineSchedContext *);
+ using ScheduleDAGCtor = ScheduleDAGInstrs *(*)(MachineSchedContext *);
// RegisterPassParser requires a (misnamed) FunctionPassCtor type.
- typedef ScheduleDAGCtor FunctionPassCtor;
+ using FunctionPassCtor = ScheduleDAGCtor;
static MachinePassRegistry Registry;
@@ -527,7 +532,7 @@ public:
unsigned size() const { return Queue.size(); }
- typedef std::vector<SUnit*>::iterator iterator;
+ using iterator = std::vector<SUnit*>::iterator;
iterator begin() { return Queue.begin(); }
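Note: MachineSchedRegistry is typically consumed by registering a ScheduleDAGCtor; a hedged sketch of the common in-tree pattern (createMyScheduler and the "my-sched" name are hypothetical):

    static llvm::ScheduleDAGInstrs *
    createMyScheduler(llvm::MachineSchedContext *C) {
      return new llvm::ScheduleDAGMILive(
          C, llvm::make_unique<llvm::GenericScheduler>(C));
    }
    static llvm::MachineSchedRegistry
        MySchedRegistry("my-sched", "Illustrative scheduler", createMyScheduler);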
diff --git a/include/llvm/CodeGen/PBQP/CostAllocator.h b/include/llvm/CodeGen/PBQP/CostAllocator.h
index 02d39fe383f1..bde451ae1fcc 100644
--- a/include/llvm/CodeGen/PBQP/CostAllocator.h
+++ b/include/llvm/CodeGen/PBQP/CostAllocator.h
@@ -1,4 +1,4 @@
-//===---------- CostAllocator.h - PBQP Cost Allocator -----------*- C++ -*-===//
+//===- CostAllocator.h - PBQP Cost Allocator --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -19,26 +19,28 @@
#define LLVM_CODEGEN_PBQP_COSTALLOCATOR_H
#include "llvm/ADT/DenseSet.h"
+#include <algorithm>
+#include <cstdint>
#include <memory>
-#include <type_traits>
namespace llvm {
namespace PBQP {
-template <typename ValueT>
-class ValuePool {
+template <typename ValueT> class ValuePool {
public:
- typedef std::shared_ptr<const ValueT> PoolRef;
+ using PoolRef = std::shared_ptr<const ValueT>;
private:
-
class PoolEntry : public std::enable_shared_from_this<PoolEntry> {
public:
template <typename ValueKeyT>
PoolEntry(ValuePool &Pool, ValueKeyT Value)
: Pool(Pool), Value(std::move(Value)) {}
+
~PoolEntry() { Pool.removeEntry(this); }
- const ValueT& getValue() const { return Value; }
+
+ const ValueT &getValue() const { return Value; }
+
private:
ValuePool &Pool;
ValueT Value;
@@ -46,10 +48,10 @@ private:
class PoolEntryDSInfo {
public:
- static inline PoolEntry* getEmptyKey() { return nullptr; }
+ static inline PoolEntry *getEmptyKey() { return nullptr; }
- static inline PoolEntry* getTombstoneKey() {
- return reinterpret_cast<PoolEntry*>(static_cast<uintptr_t>(1));
+ static inline PoolEntry *getTombstoneKey() {
+ return reinterpret_cast<PoolEntry *>(static_cast<uintptr_t>(1));
}
template <typename ValueKeyT>
@@ -66,8 +68,7 @@ private:
}
template <typename ValueKeyT1, typename ValueKeyT2>
- static
- bool isEqual(const ValueKeyT1 &C1, const ValueKeyT2 &C2) {
+ static bool isEqual(const ValueKeyT1 &C1, const ValueKeyT2 &C2) {
return C1 == C2;
}
@@ -83,10 +84,9 @@ private:
return P1 == P2;
return isEqual(P1->getValue(), P2);
}
-
};
- typedef DenseSet<PoolEntry*, PoolEntryDSInfo> EntrySetT;
+ using EntrySetT = DenseSet<PoolEntry *, PoolEntryDSInfo>;
EntrySetT EntrySet;
@@ -105,28 +105,31 @@ public:
}
};
-template <typename VectorT, typename MatrixT>
-class PoolCostAllocator {
+template <typename VectorT, typename MatrixT> class PoolCostAllocator {
private:
- typedef ValuePool<VectorT> VectorCostPool;
- typedef ValuePool<MatrixT> MatrixCostPool;
+ using VectorCostPool = ValuePool<VectorT>;
+ using MatrixCostPool = ValuePool<MatrixT>;
+
public:
- typedef VectorT Vector;
- typedef MatrixT Matrix;
- typedef typename VectorCostPool::PoolRef VectorPtr;
- typedef typename MatrixCostPool::PoolRef MatrixPtr;
+ using Vector = VectorT;
+ using Matrix = MatrixT;
+ using VectorPtr = typename VectorCostPool::PoolRef;
+ using MatrixPtr = typename MatrixCostPool::PoolRef;
+
+ template <typename VectorKeyT> VectorPtr getVector(VectorKeyT v) {
+ return VectorPool.getValue(std::move(v));
+ }
- template <typename VectorKeyT>
- VectorPtr getVector(VectorKeyT v) { return VectorPool.getValue(std::move(v)); }
+ template <typename MatrixKeyT> MatrixPtr getMatrix(MatrixKeyT m) {
+ return MatrixPool.getValue(std::move(m));
+ }
- template <typename MatrixKeyT>
- MatrixPtr getMatrix(MatrixKeyT m) { return MatrixPool.getValue(std::move(m)); }
private:
VectorCostPool VectorPool;
MatrixCostPool MatrixPool;
};
-} // namespace PBQP
-} // namespace llvm
+} // end namespace PBQP
+} // end namespace llvm
-#endif
+#endif // LLVM_CODEGEN_PBQP_COSTALLOCATOR_H
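Note: a small sketch of the pooling behavior. getValue() returns a reference-counted handle, and equal values share a single entry (this assumes Vector's equality comparison, which the pool's DenseSet relies on):

    void pooling() {
      using VecPool = llvm::PBQP::ValuePool<llvm::PBQP::Vector>;
      VecPool Pool;
      VecPool::PoolRef A = Pool.getValue(llvm::PBQP::Vector(4));
      VecPool::PoolRef B = Pool.getValue(llvm::PBQP::Vector(4));
      // A and B refer to the same pooled entry; ~PoolEntry calls
      // removeEntry() on the owning pool when the last PoolRef dies.
    }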
diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index 83487e6a808a..e94878ced10d 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h
@@ -1,4 +1,4 @@
-//===-------------------- Graph.h - PBQP Graph ------------------*- C++ -*-===//
+//===- Graph.h - PBQP Graph -------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,16 +11,14 @@
//
//===----------------------------------------------------------------------===//
-
#ifndef LLVM_CODEGEN_PBQP_GRAPH_H
#define LLVM_CODEGEN_PBQP_GRAPH_H
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
+#include <iterator>
#include <limits>
-#include <utility>
#include <vector>
namespace llvm {
@@ -28,8 +26,8 @@ namespace PBQP {
class GraphBase {
public:
- typedef unsigned NodeId;
- typedef unsigned EdgeId;
+ using NodeId = unsigned;
+ using EdgeId = unsigned;
/// @brief Returns a value representing an invalid (non-existent) node.
static NodeId invalidNodeId() {
@@ -48,32 +46,32 @@ namespace PBQP {
template <typename SolverT>
class Graph : public GraphBase {
private:
- typedef typename SolverT::CostAllocator CostAllocator;
+ using CostAllocator = typename SolverT::CostAllocator;
+
public:
- typedef typename SolverT::RawVector RawVector;
- typedef typename SolverT::RawMatrix RawMatrix;
- typedef typename SolverT::Vector Vector;
- typedef typename SolverT::Matrix Matrix;
- typedef typename CostAllocator::VectorPtr VectorPtr;
- typedef typename CostAllocator::MatrixPtr MatrixPtr;
- typedef typename SolverT::NodeMetadata NodeMetadata;
- typedef typename SolverT::EdgeMetadata EdgeMetadata;
- typedef typename SolverT::GraphMetadata GraphMetadata;
+ using RawVector = typename SolverT::RawVector;
+ using RawMatrix = typename SolverT::RawMatrix;
+ using Vector = typename SolverT::Vector;
+ using Matrix = typename SolverT::Matrix;
+ using VectorPtr = typename CostAllocator::VectorPtr;
+ using MatrixPtr = typename CostAllocator::MatrixPtr;
+ using NodeMetadata = typename SolverT::NodeMetadata;
+ using EdgeMetadata = typename SolverT::EdgeMetadata;
+ using GraphMetadata = typename SolverT::GraphMetadata;
private:
-
class NodeEntry {
public:
- typedef std::vector<EdgeId> AdjEdgeList;
- typedef AdjEdgeList::size_type AdjEdgeIdx;
- typedef AdjEdgeList::const_iterator AdjEdgeItr;
+ using AdjEdgeList = std::vector<EdgeId>;
+ using AdjEdgeIdx = AdjEdgeList::size_type;
+ using AdjEdgeItr = AdjEdgeList::const_iterator;
+
+ NodeEntry(VectorPtr Costs) : Costs(std::move(Costs)) {}
static AdjEdgeIdx getInvalidAdjEdgeIdx() {
return std::numeric_limits<AdjEdgeIdx>::max();
}
- NodeEntry(VectorPtr Costs) : Costs(std::move(Costs)) {}
-
AdjEdgeIdx addAdjEdgeId(EdgeId EId) {
AdjEdgeIdx Idx = AdjEdgeIds.size();
AdjEdgeIds.push_back(EId);
@@ -96,6 +94,7 @@ namespace PBQP {
VectorPtr Costs;
NodeMetadata Metadata;
+
private:
AdjEdgeList AdjEdgeIds;
};
@@ -150,8 +149,10 @@ namespace PBQP {
NodeId getN1Id() const { return NIds[0]; }
NodeId getN2Id() const { return NIds[1]; }
+
MatrixPtr Costs;
EdgeMetadata Metadata;
+
private:
NodeId NIds[2];
typename NodeEntry::AdjEdgeIdx ThisEdgeAdjIdxs[2];
@@ -161,18 +162,20 @@ namespace PBQP {
GraphMetadata Metadata;
CostAllocator CostAlloc;
- SolverT *Solver;
+ SolverT *Solver = nullptr;
- typedef std::vector<NodeEntry> NodeVector;
- typedef std::vector<NodeId> FreeNodeVector;
+ using NodeVector = std::vector<NodeEntry>;
+ using FreeNodeVector = std::vector<NodeId>;
NodeVector Nodes;
FreeNodeVector FreeNodeIds;
- typedef std::vector<EdgeEntry> EdgeVector;
- typedef std::vector<EdgeId> FreeEdgeVector;
+ using EdgeVector = std::vector<EdgeEntry>;
+ using FreeEdgeVector = std::vector<EdgeId>;
EdgeVector Edges;
FreeEdgeVector FreeEdgeIds;
+ Graph(const Graph &Other) {}
+
// ----- INTERNAL METHODS -----
NodeEntry &getNode(NodeId NId) {
@@ -220,20 +223,18 @@ namespace PBQP {
return EId;
}
- Graph(const Graph &Other) {}
void operator=(const Graph &Other) {}
public:
-
- typedef typename NodeEntry::AdjEdgeItr AdjEdgeItr;
+ using AdjEdgeItr = typename NodeEntry::AdjEdgeItr;
class NodeItr {
public:
- typedef std::forward_iterator_tag iterator_category;
- typedef NodeId value_type;
- typedef int difference_type;
- typedef NodeId* pointer;
- typedef NodeId& reference;
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = NodeId;
+ using difference_type = int;
+ using pointer = NodeId *;
+ using reference = NodeId &;
NodeItr(NodeId CurNId, const Graph &G)
: CurNId(CurNId), EndNId(G.Nodes.size()), FreeNodeIds(G.FreeNodeIds) {
@@ -283,53 +284,65 @@ namespace PBQP {
class NodeIdSet {
public:
- NodeIdSet(const Graph &G) : G(G) { }
+ NodeIdSet(const Graph &G) : G(G) {}
+
NodeItr begin() const { return NodeItr(0, G); }
NodeItr end() const { return NodeItr(G.Nodes.size(), G); }
+
bool empty() const { return G.Nodes.empty(); }
+
typename NodeVector::size_type size() const {
return G.Nodes.size() - G.FreeNodeIds.size();
}
+
private:
const Graph& G;
};
class EdgeIdSet {
public:
- EdgeIdSet(const Graph &G) : G(G) { }
+ EdgeIdSet(const Graph &G) : G(G) {}
+
EdgeItr begin() const { return EdgeItr(0, G); }
EdgeItr end() const { return EdgeItr(G.Edges.size(), G); }
+
bool empty() const { return G.Edges.empty(); }
+
typename NodeVector::size_type size() const {
return G.Edges.size() - G.FreeEdgeIds.size();
}
+
private:
const Graph& G;
};
class AdjEdgeIdSet {
public:
- AdjEdgeIdSet(const NodeEntry &NE) : NE(NE) { }
+ AdjEdgeIdSet(const NodeEntry &NE) : NE(NE) {}
+
typename NodeEntry::AdjEdgeItr begin() const {
return NE.getAdjEdgeIds().begin();
}
+
typename NodeEntry::AdjEdgeItr end() const {
return NE.getAdjEdgeIds().end();
}
+
bool empty() const { return NE.getAdjEdgeIds().empty(); }
+
typename NodeEntry::AdjEdgeList::size_type size() const {
return NE.getAdjEdgeIds().size();
}
+
private:
const NodeEntry &NE;
};
/// @brief Construct an empty PBQP graph.
- Graph() : Solver(nullptr) {}
+ Graph() = default;
/// @brief Construct an empty PBQP graph with the given graph metadata.
- Graph(GraphMetadata Metadata)
- : Metadata(std::move(Metadata)), Solver(nullptr) {}
+ Graph(GraphMetadata Metadata) : Metadata(std::move(Metadata)) {}
/// @brief Get a reference to the graph metadata.
GraphMetadata& getMetadata() { return Metadata; }
@@ -656,7 +669,7 @@ namespace PBQP {
}
};
-} // namespace PBQP
-} // namespace llvm
+} // end namespace PBQP
+} // end namespace llvm
#endif // LLVM_CODEGEN_PBQP_GRAPH_HPP
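Note: an illustrative traversal using the renamed id-set types (a sketch; SolverT stands in for any concrete solver such as RegAllocSolverImpl):

    template <typename SolverT>
    void visitAll(llvm::PBQP::Graph<SolverT> &G) {
      for (auto NId : G.nodeIds())         // backed by NodeIdSet
        for (auto EId : G.adjEdgeIds(NId)) // backed by AdjEdgeIdSet
          (void)EId;                       // each incident edge, per endpoint
    }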
diff --git a/include/llvm/CodeGen/PBQP/Math.h b/include/llvm/CodeGen/PBQP/Math.h
index 278787550a43..ba405e816d10 100644
--- a/include/llvm/CodeGen/PBQP/Math.h
+++ b/include/llvm/CodeGen/PBQP/Math.h
@@ -1,4 +1,4 @@
-//===------ Math.h - PBQP Vector and Matrix classes -------------*- C++ -*-===//
+//===- Math.h - PBQP Vector and Matrix classes ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,20 +11,22 @@
#define LLVM_CODEGEN_PBQP_MATH_H
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
#include <algorithm>
#include <cassert>
#include <functional>
+#include <memory>
namespace llvm {
namespace PBQP {
-typedef float PBQPNum;
+using PBQPNum = float;
/// \brief PBQP Vector class.
class Vector {
friend hash_code hash_value(const Vector &);
-public:
+public:
/// \brief Construct a PBQP vector of the given size.
explicit Vector(unsigned Length)
: Length(Length), Data(llvm::make_unique<PBQPNum []>(Length)) {}
@@ -120,8 +122,8 @@ OStream& operator<<(OStream &OS, const Vector &V) {
class Matrix {
private:
friend hash_code hash_value(const Matrix &);
-public:
+public:
/// \brief Construct a PBQP Matrix with the given dimensions.
Matrix(unsigned Rows, unsigned Cols) :
Rows(Rows), Cols(Cols), Data(llvm::make_unique<PBQPNum []>(Rows * Cols)) {
@@ -253,9 +255,11 @@ OStream& operator<<(OStream &OS, const Matrix &M) {
template <typename Metadata>
class MDVector : public Vector {
public:
- MDVector(const Vector &v) : Vector(v), md(*this) { }
+ MDVector(const Vector &v) : Vector(v), md(*this) {}
MDVector(Vector &&v) : Vector(std::move(v)), md(*this) { }
+
const Metadata& getMetadata() const { return md; }
+
private:
Metadata md;
};
@@ -268,9 +272,11 @@ inline hash_code hash_value(const MDVector<Metadata> &V) {
template <typename Metadata>
class MDMatrix : public Matrix {
public:
- MDMatrix(const Matrix &m) : Matrix(m), md(*this) { }
+ MDMatrix(const Matrix &m) : Matrix(m), md(*this) {}
MDMatrix(Matrix &&m) : Matrix(std::move(m)), md(*this) { }
+
const Metadata& getMetadata() const { return md; }
+
private:
Metadata md;
};
@@ -280,7 +286,7 @@ inline hash_code hash_value(const MDMatrix<Metadata> &M) {
return hash_value(static_cast<const Matrix&>(M));
}
-} // namespace PBQP
-} // namespace llvm
+} // end namespace PBQP
+} // end namespace llvm
#endif // LLVM_CODEGEN_PBQP_MATH_H
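Note: a small usage sketch of the PBQP math types (values invented for illustration):

    void costs() {
      llvm::PBQP::Vector V(3);    // length-3 cost vector (PBQPNum == float)
      V[0] = 1.0f;
      llvm::PBQP::Matrix M(3, 2); // 3x2 edge-cost matrix
      M[0][1] = 2.5f;             // operator[] returns a row pointer
    }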
diff --git a/include/llvm/CodeGen/PBQP/ReductionRules.h b/include/llvm/CodeGen/PBQP/ReductionRules.h
index d4a544bfe721..8aeb51936760 100644
--- a/include/llvm/CodeGen/PBQP/ReductionRules.h
+++ b/include/llvm/CodeGen/PBQP/ReductionRules.h
@@ -1,4 +1,4 @@
-//===----------- ReductionRules.h - Reduction Rules -------------*- C++ -*-===//
+//===- ReductionRules.h - Reduction Rules -----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,8 @@
#include "Graph.h"
#include "Math.h"
#include "Solution.h"
+#include <cassert>
+#include <limits>
namespace llvm {
namespace PBQP {
@@ -27,11 +29,11 @@ namespace PBQP {
/// neighbor. Notify the problem domain.
template <typename GraphT>
void applyR1(GraphT &G, typename GraphT::NodeId NId) {
- typedef typename GraphT::NodeId NodeId;
- typedef typename GraphT::EdgeId EdgeId;
- typedef typename GraphT::Vector Vector;
- typedef typename GraphT::Matrix Matrix;
- typedef typename GraphT::RawVector RawVector;
+ using NodeId = typename GraphT::NodeId;
+ using EdgeId = typename GraphT::EdgeId;
+ using Vector = typename GraphT::Vector;
+ using Matrix = typename GraphT::Matrix;
+ using RawVector = typename GraphT::RawVector;
assert(G.getNodeDegree(NId) == 1 &&
"R1 applied to node with degree != 1.");
@@ -71,11 +73,11 @@ namespace PBQP {
template <typename GraphT>
void applyR2(GraphT &G, typename GraphT::NodeId NId) {
- typedef typename GraphT::NodeId NodeId;
- typedef typename GraphT::EdgeId EdgeId;
- typedef typename GraphT::Vector Vector;
- typedef typename GraphT::Matrix Matrix;
- typedef typename GraphT::RawMatrix RawMatrix;
+ using NodeId = typename GraphT::NodeId;
+ using EdgeId = typename GraphT::EdgeId;
+ using Vector = typename GraphT::Vector;
+ using Matrix = typename GraphT::Matrix;
+ using RawMatrix = typename GraphT::RawMatrix;
assert(G.getNodeDegree(NId) == 2 &&
"R2 applied to node with degree != 2.");
@@ -177,9 +179,9 @@ namespace PBQP {
// state.
template <typename GraphT, typename StackT>
Solution backpropagate(GraphT& G, StackT stack) {
- typedef GraphBase::NodeId NodeId;
- typedef typename GraphT::Matrix Matrix;
- typedef typename GraphT::RawVector RawVector;
+ using NodeId = GraphBase::NodeId;
+ using Matrix = typename GraphT::Matrix;
+ using RawVector = typename GraphT::RawVector;
Solution s;
@@ -215,7 +217,7 @@ namespace PBQP {
return s;
}
-} // namespace PBQP
-} // namespace llvm
+} // end namespace PBQP
+} // end namespace llvm
-#endif
+#endif // LLVM_CODEGEN_PBQP_REDUCTIONRULES_H
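Note: as a worked illustration of applyR1 (numbers invented for this example): let degree-1 node N have cost vector (0, 2) and let its sole edge to neighbor M carry the matrix [[3, 1], [0, 4]], with N's selections indexing rows. R1 adds min_i(N[i] + E[i][j]) to M's vector for each selection j of M: column 0 contributes min(0+3, 2+0) = 2 and column 1 contributes min(0+1, 2+4) = 1. N is then disconnected, and backpropagate() later recovers N's minimizing row once M's selection is known.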
diff --git a/include/llvm/CodeGen/PBQP/Solution.h b/include/llvm/CodeGen/PBQP/Solution.h
index d96b5eac4520..8d5d2374679d 100644
--- a/include/llvm/CodeGen/PBQP/Solution.h
+++ b/include/llvm/CodeGen/PBQP/Solution.h
@@ -26,7 +26,7 @@ namespace PBQP {
/// To get the selection for each node in the problem use the getSelection method.
class Solution {
private:
- typedef std::map<GraphBase::NodeId, unsigned> SelectionsMap;
+ using SelectionsMap = std::map<GraphBase::NodeId, unsigned>;
SelectionsMap selections;
unsigned r0Reductions = 0;
diff --git a/include/llvm/CodeGen/PBQPRAConstraint.h b/include/llvm/CodeGen/PBQPRAConstraint.h
index 833b9bad613f..269b7a7b3a35 100644
--- a/include/llvm/CodeGen/PBQPRAConstraint.h
+++ b/include/llvm/CodeGen/PBQPRAConstraint.h
@@ -1,4 +1,4 @@
-//===-- RegAllocPBQP.h ------------------------------------------*- C++ -*-===//
+//===- RegAllocPBQP.h -------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,23 +16,22 @@
#ifndef LLVM_CODEGEN_PBQPRACONSTRAINT_H
#define LLVM_CODEGEN_PBQPRACONSTRAINT_H
+#include <algorithm>
#include <memory>
#include <vector>
namespace llvm {
+
namespace PBQP {
namespace RegAlloc {
+
// Forward declare PBQP graph class.
class PBQPRAGraph;
-}
-}
-class LiveIntervals;
-class MachineBlockFrequencyInfo;
-class MachineFunction;
-class TargetRegisterInfo;
+} // end namespace RegAlloc
+} // end namespace PBQP
-typedef PBQP::RegAlloc::PBQPRAGraph PBQPRAGraph;
+using PBQPRAGraph = PBQP::RegAlloc::PBQPRAGraph;
/// @brief Abstract base for classes implementing PBQP register allocation
/// constraints (e.g. Spill-costs, interference, coalescing).
@@ -40,6 +39,7 @@ class PBQPRAConstraint {
public:
virtual ~PBQPRAConstraint() = 0;
virtual void apply(PBQPRAGraph &G) = 0;
+
private:
virtual void anchor();
};
@@ -59,11 +59,13 @@ public:
if (C)
Constraints.push_back(std::move(C));
}
+
private:
std::vector<std::unique_ptr<PBQPRAConstraint>> Constraints;
+
void anchor() override;
};
-}
+} // end namespace llvm
-#endif /* LLVM_CODEGEN_PBQPRACONSTRAINT_H */
+#endif // LLVM_CODEGEN_PBQPRACONSTRAINT_H
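Note: a hedged sketch of implementing and registering a constraint against the interface above (MyConstraint is hypothetical; the AArch64 PBQP constraints touched by this patch are the in-tree example):

    class MyConstraint : public llvm::PBQPRAConstraint {
    public:
      void apply(llvm::PBQPRAGraph &G) override {
        // adjust node/edge costs in G before the solver runs
      }
    };

    void addTo(llvm::PBQPRAConstraintList &L) {
      L.addConstraint(llvm::make_unique<MyConstraint>());
    }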
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 3bcfc1c4254b..96cfce5b84df 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -140,6 +140,9 @@ namespace llvm {
/// Greedy register allocator.
extern char &RAGreedyID;
+ /// Basic register allocator.
+ extern char &RABasicID;
+
/// VirtRegRewriter pass. Rewrite virtual registers to physical registers as
/// assigned in VirtRegMap.
extern char &VirtRegRewriterID;
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 8872a5dc54a1..5b342863eb50 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -130,10 +130,10 @@ inline hash_code hash_value(const AllowedRegVector &OptRegs) {
/// \brief Holds graph-level metadata relevant to PBQP RA problems.
class GraphMetadata {
private:
- typedef ValuePool<AllowedRegVector> AllowedRegVecPool;
+ using AllowedRegVecPool = ValuePool<AllowedRegVector>;
public:
- typedef AllowedRegVecPool::PoolRef AllowedRegVecRef;
+ using AllowedRegVecRef = AllowedRegVecPool::PoolRef;
GraphMetadata(MachineFunction &MF,
LiveIntervals &LIS,
@@ -167,17 +167,17 @@ private:
/// \brief Holds solver state and other metadata relevant to each PBQP RA node.
class NodeMetadata {
public:
- typedef RegAlloc::AllowedRegVector AllowedRegVector;
+ using AllowedRegVector = RegAlloc::AllowedRegVector;
// The node's reduction state. The order in this enum is important,
// as it is assumed nodes can only progress up (i.e. towards being
// optimally reducible) when reducing the graph.
- typedef enum {
+ using ReductionState = enum {
Unprocessed,
NotProvablyAllocatable,
ConservativelyAllocatable,
OptimallyReducible
- } ReductionState;
+ };
NodeMetadata() = default;
@@ -267,23 +267,23 @@ private:
class RegAllocSolverImpl {
private:
- typedef MDMatrix<MatrixMetadata> RAMatrix;
+ using RAMatrix = MDMatrix<MatrixMetadata>;
public:
- typedef PBQP::Vector RawVector;
- typedef PBQP::Matrix RawMatrix;
- typedef PBQP::Vector Vector;
- typedef RAMatrix Matrix;
- typedef PBQP::PoolCostAllocator<Vector, Matrix> CostAllocator;
+ using RawVector = PBQP::Vector;
+ using RawMatrix = PBQP::Matrix;
+ using Vector = PBQP::Vector;
+ using Matrix = RAMatrix;
+ using CostAllocator = PBQP::PoolCostAllocator<Vector, Matrix>;
- typedef GraphBase::NodeId NodeId;
- typedef GraphBase::EdgeId EdgeId;
+ using NodeId = GraphBase::NodeId;
+ using EdgeId = GraphBase::EdgeId;
- typedef RegAlloc::NodeMetadata NodeMetadata;
- struct EdgeMetadata { };
- typedef RegAlloc::GraphMetadata GraphMetadata;
+ using NodeMetadata = RegAlloc::NodeMetadata;
+ struct EdgeMetadata {};
+ using GraphMetadata = RegAlloc::GraphMetadata;
- typedef PBQP::Graph<RegAllocSolverImpl> Graph;
+ using Graph = PBQP::Graph<RegAllocSolverImpl>;
RegAllocSolverImpl(Graph &G) : G(G) {}
@@ -426,7 +426,7 @@ private:
std::vector<GraphBase::NodeId> reduce() {
assert(!G.empty() && "Cannot reduce empty graph.");
- typedef GraphBase::NodeId NodeId;
+ using NodeId = GraphBase::NodeId;
std::vector<NodeId> NodeStack;
// Consume worklists.
@@ -459,7 +459,6 @@ private:
ConservativelyAllocatableNodes.erase(NItr);
NodeStack.push_back(NId);
G.disconnectAllNeighborsFromNode(NId);
-
} else if (!NotProvablyAllocatableNodes.empty()) {
NodeSet::iterator NItr =
std::min_element(NotProvablyAllocatableNodes.begin(),
@@ -493,7 +492,7 @@ private:
};
Graph& G;
- typedef std::set<NodeId> NodeSet;
+ using NodeSet = std::set<NodeId>;
NodeSet OptimallyReducibleNodes;
NodeSet ConservativelyAllocatableNodes;
NodeSet NotProvablyAllocatableNodes;
@@ -501,7 +500,7 @@ private:
class PBQPRAGraph : public PBQP::Graph<RegAllocSolverImpl> {
private:
- typedef PBQP::Graph<RegAllocSolverImpl> BaseT;
+ using BaseT = PBQP::Graph<RegAllocSolverImpl>;
public:
PBQPRAGraph(GraphMetadata Metadata) : BaseT(std::move(Metadata)) {}
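
One non-obvious conversion in this hunk is using ReductionState = enum { ... };. Defining a type inside an alias-declaration is legal C++ (the restriction applies only to alias templates) and behaves exactly like the old typedef'd unnamed enum. A minimal sketch with illustrative names:

typedef enum { OldA, OldB } OldState;   // classic C-style spelling
using NewState = enum { NewA, NewB };   // alias-declaration spelling

int main() {
  // In both cases the enumerators land in the enclosing scope.
  OldState O = OldA;
  NewState N = NewA;
  return (O == OldA && N == NewA) ? 0 : 1;
}
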
diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h
index 1f939e72e139..ad1efe18c72d 100644
--- a/include/llvm/CodeGen/RegisterScavenging.h
+++ b/include/llvm/CodeGen/RegisterScavenging.h
@@ -204,6 +204,10 @@ private:
void setLiveInsUsed(const MachineBasicBlock &MBB);
};
+/// Replaces all frame index virtual registers with physical registers. Uses
+/// the register scavenger to find an appropriate physical register.
+void scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS);
+
} // end namespace llvm
#endif // LLVM_CODEGEN_REGISTERSCAVENGING_H
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index f5f5bfd45e79..d62bb9bf0b75 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -1,4 +1,4 @@
-//==- ScheduleDAGInstrs.h - MachineInstr Scheduling --------------*- C++ -*-==//
+//===- ScheduleDAGInstrs.h - MachineInstr Scheduling ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,22 +15,38 @@
#ifndef LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
#define LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseMultiSet.h"
#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/MC/LaneBitmask.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
#include <list>
+#include <utility>
+#include <vector>
namespace llvm {
+
+ class LiveIntervals;
class MachineFrameInfo;
+ class MachineFunction;
+ class MachineInstr;
class MachineLoopInfo;
- class MachineDominatorTree;
- class RegPressureTracker;
+ class MachineOperand;
+ struct MCSchedClassDesc;
class PressureDiffs;
+ class PseudoSourceValue;
+ class RegPressureTracker;
+ class UndefValue;
+ class Value;
/// An individual mapping from virtual register number to SUnit.
struct VReg2SUnit {
@@ -70,31 +86,34 @@ namespace llvm {
/// Use a SparseMultiSet to track physical registers. Storage is only
/// allocated once for the pass. It can be cleared in constant time and reused
/// without any frees.
- typedef SparseMultiSet<PhysRegSUOper, llvm::identity<unsigned>, uint16_t>
- Reg2SUnitsMap;
+ using Reg2SUnitsMap =
+ SparseMultiSet<PhysRegSUOper, identity<unsigned>, uint16_t>;
/// Use SparseSet as a SparseMap by relying on the fact that it never
/// compares ValueT's, only unsigned keys. This allows the set to be cleared
/// between scheduling regions in constant time as long as ValueT does not
/// require a destructor.
- typedef SparseSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMap;
+ using VReg2SUnitMap = SparseSet<VReg2SUnit, VirtReg2IndexFunctor>;
/// Track local uses of virtual registers. These uses are gathered by the DAG
/// builder and may be consulted by the scheduler to avoid iterating an entire
/// vreg use list.
- typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMultiMap;
+ using VReg2SUnitMultiMap = SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor>;
+
- typedef SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>
- VReg2SUnitOperIdxMultiMap;
+ using VReg2SUnitOperIdxMultiMap =
+ SparseMultiSet<VReg2SUnitOperIdx, VirtReg2IndexFunctor>;
- typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+ using ValueType = PointerUnion<const Value *, const PseudoSourceValue *>;
struct UnderlyingObject : PointerIntPair<ValueType, 1, bool> {
UnderlyingObject(ValueType V, bool MayAlias)
: PointerIntPair<ValueType, 1, bool>(V, MayAlias) {}
+
ValueType getValue() const { return getPointer(); }
bool mayAlias() const { return getInt(); }
};
- typedef SmallVector<UnderlyingObject, 4> UnderlyingObjectsVector;
+
+ using UnderlyingObjectsVector = SmallVector<UnderlyingObject, 4>;
/// A ScheduleDAG for scheduling lists of MachineInstr.
class ScheduleDAGInstrs : public ScheduleDAG {
@@ -114,10 +133,10 @@ namespace llvm {
/// reordering. A specialized scheduler can override
/// TargetInstrInfo::isSchedulingBoundary then enable this flag to indicate
/// it has taken responsibility for scheduling the terminator correctly.
- bool CanHandleTerminators;
+ bool CanHandleTerminators = false;
/// Whether lane masks should get tracked.
- bool TrackLaneMasks;
+ bool TrackLaneMasks = false;
// State specific to the current scheduling region.
// ------------------------------------------------
@@ -155,12 +174,12 @@ namespace llvm {
/// Tracks the last instructions in this region using each virtual register.
VReg2SUnitOperIdxMultiMap CurrentVRegUses;
- AliasAnalysis *AAForDep;
+ AliasAnalysis *AAForDep = nullptr;
/// Remember a generic side-effecting instruction as we proceed.
/// No other SU ever gets scheduled around it (except in the special
/// case of a huge region that gets reduced).
- SUnit *BarrierChain;
+ SUnit *BarrierChain = nullptr;
public:
/// A list of SUnits, used in Value2SUsMap, during DAG construction.
@@ -168,7 +187,7 @@ namespace llvm {
/// implementation of this data structure, such as a singly linked list
/// with a memory pool (SmallVector was tried but slow and SparseSet is not
/// applicable).
- typedef std::list<SUnit *> SUList;
+ using SUList = std::list<SUnit *>;
protected:
/// \brief A map from ValueType to SUList, used during DAG construction, as
@@ -216,13 +235,13 @@ namespace llvm {
/// For an unanalyzable memory access, this Value is used in maps.
UndefValue *UnknownValue;
- typedef std::vector<std::pair<MachineInstr *, MachineInstr *>>
- DbgValueVector;
+ using DbgValueVector =
+ std::vector<std::pair<MachineInstr *, MachineInstr *>>;
/// Remember instruction that precedes DBG_VALUE.
/// These are generated by buildSchedGraph but persist so they can be
/// referenced when emitting the final schedule.
DbgValueVector DbgValues;
- MachineInstr *FirstDbgValue;
+ MachineInstr *FirstDbgValue = nullptr;
/// Set of live physical registers for updating kill flags.
LivePhysRegs LiveRegs;
@@ -232,7 +251,7 @@ namespace llvm {
const MachineLoopInfo *mli,
bool RemoveKillFlags = false);
- ~ScheduleDAGInstrs() override {}
+ ~ScheduleDAGInstrs() override = default;
/// Gets the machine model for instruction scheduling.
const TargetSchedModel *getSchedModel() const { return &SchedModel; }
@@ -354,6 +373,7 @@ namespace llvm {
return nullptr;
return I->second;
}
+
} // end namespace llvm
-#endif
+#endif // LLVM_CODEGEN_SCHEDULEDAGINSTRS_H
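
The UnderlyingObject struct above derives from PointerIntPair to fold a may-alias flag into the low bits of a pointer. A toy re-creation of the idea in plain C++ (illustrative types; LLVM's real PointerIntPair is considerably more general):

#include <cassert>
#include <cstdint>

struct PtrBoolPair {
  uintptr_t Bits = 0;

  PtrBoolPair(int *P, bool Flag)
      : Bits(reinterpret_cast<uintptr_t>(P) | uintptr_t(Flag)) {
    // An aligned pointer's low bit is guaranteed zero, so the flag can
    // live there instead of occupying a second word.
    assert((reinterpret_cast<uintptr_t>(P) & 1) == 0 && "unaligned pointer");
  }

  int *getPointer() const {
    return reinterpret_cast<int *>(Bits & ~uintptr_t(1));
  }
  bool getInt() const { return Bits & 1; }
};

int main() {
  static int X = 7;
  PtrBoolPair P(&X, /*Flag=*/true);
  return (*P.getPointer() == 7 && P.getInt()) ? 0 : 1;
}
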
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 493122b15704..4b1a375abd57 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ---------*- C++ -*-===//
+//===- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,35 +15,72 @@
#ifndef LLVM_CODEGEN_SELECTIONDAG_H
#define LLVM_CODEGEN_SELECTIONDAG_H
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/ilist.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
+#include <cstdint>
+#include <functional>
#include <map>
#include <string>
+#include <tuple>
+#include <utility>
#include <vector>
namespace llvm {
+class BlockAddress;
+class Constant;
+class ConstantFP;
+class ConstantInt;
+class DataLayout;
+struct fltSemantics;
+class GlobalValue;
struct KnownBits;
+class LLVMContext;
+class MachineBasicBlock;
class MachineConstantPoolValue;
-class MachineFunction;
-class MDNode;
+class MCSymbol;
class OptimizationRemarkEmitter;
class SDDbgValue;
-class TargetLowering;
+class SelectionDAG;
class SelectionDAGTargetInfo;
+class TargetLowering;
+class TargetMachine;
+class TargetSubtargetInfo;
+class Value;
class SDVTListNode : public FoldingSetNode {
friend struct FoldingSetTrait<SDVTListNode>;
+
/// A reference to an Interned FoldingSetNodeID for this node.
/// The Allocator in SelectionDAG holds the data.
/// SDVTList contains all types which are frequently accessed in SelectionDAG.
@@ -55,11 +92,13 @@ class SDVTListNode : public FoldingSetNode {
/// The hash value for SDVTList is fixed, so cache it to avoid
/// hash calculation.
unsigned HashValue;
+
public:
SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num) :
FastID(ID), VTs(VT), NumVTs(Num) {
HashValue = ID.ComputeHash();
}
+
SDVTList getSDVTList() {
SDVTList result = {VTs, NumVTs};
return result;
@@ -72,12 +111,14 @@ template<> struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTLis
static void Profile(const SDVTListNode &X, FoldingSetNodeID& ID) {
ID = X.FastID;
}
+
static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID,
unsigned IDHash, FoldingSetNodeID &TempID) {
if (X.HashValue != IDHash)
return false;
return ID == X.FastID;
}
+
static unsigned ComputeHash(const SDVTListNode &X, FoldingSetNodeID &TempID) {
return X.HashValue;
}
@@ -104,13 +145,13 @@ class SDDbgInfo {
BumpPtrAllocator Alloc;
SmallVector<SDDbgValue*, 32> DbgValues;
SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
- typedef DenseMap<const SDNode*, SmallVector<SDDbgValue*, 2> > DbgValMapType;
+ using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
DbgValMapType DbgValMap;
- void operator=(const SDDbgInfo&) = delete;
- SDDbgInfo(const SDDbgInfo&) = delete;
public:
- SDDbgInfo() {}
+ SDDbgInfo() = default;
+ SDDbgInfo(const SDDbgInfo &) = delete;
+ SDDbgInfo &operator=(const SDDbgInfo &) = delete;
void add(SDDbgValue *V, const SDNode *Node, bool isParameter) {
if (isParameter) {
@@ -144,14 +185,14 @@ public:
return ArrayRef<SDDbgValue*>();
}
- typedef SmallVectorImpl<SDDbgValue*>::iterator DbgIterator;
+ using DbgIterator = SmallVectorImpl<SDDbgValue*>::iterator;
+
DbgIterator DbgBegin() { return DbgValues.begin(); }
DbgIterator DbgEnd() { return DbgValues.end(); }
DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); }
};
-class SelectionDAG;
void checkForCycles(const SelectionDAG *DAG, bool force = false);
/// This is used to represent a portion of an LLVM function in a low-level
@@ -167,8 +208,8 @@ void checkForCycles(const SelectionDAG *DAG, bool force = false);
///
class SelectionDAG {
const TargetMachine &TM;
- const SelectionDAGTargetInfo *TSI;
- const TargetLowering *TLI;
+ const SelectionDAGTargetInfo *TSI = nullptr;
+ const TargetLowering *TLI = nullptr;
MachineFunction *MF;
LLVMContext *Context;
CodeGenOpt::Level OptLevel;
@@ -188,9 +229,9 @@ class SelectionDAG {
/// The AllocatorType for allocating SDNodes. We use
/// pool allocation with recycling.
- typedef RecyclingAllocator<BumpPtrAllocator, SDNode, sizeof(LargestSDNode),
- alignof(MostAlignedSDNode)>
- NodeAllocatorType;
+ using NodeAllocatorType = RecyclingAllocator<BumpPtrAllocator, SDNode,
+ sizeof(LargestSDNode),
+ alignof(MostAlignedSDNode)>;
/// Pool allocation for nodes.
NodeAllocatorType NodeAllocator;
@@ -243,9 +284,11 @@ public:
struct DAGNodeDeletedListener : public DAGUpdateListener {
std::function<void(SDNode *, SDNode *)> Callback;
+
DAGNodeDeletedListener(SelectionDAG &DAG,
std::function<void(SDNode *, SDNode *)> Callback)
: DAGUpdateListener(DAG), Callback(std::move(Callback)) {}
+
void NodeDeleted(SDNode *N, SDNode *E) override { Callback(N, E); }
};
@@ -254,7 +297,7 @@ public:
/// have legal types. This is important after type legalization since
/// any illegally typed nodes generated after this point will not experience
/// type legalization.
- bool NewNodesMustHaveLegalTypes;
+ bool NewNodesMustHaveLegalTypes = false;
private:
/// DAGUpdateListener is a friend so it can manipulate the listener stack.
@@ -262,7 +305,7 @@ private:
/// Linked list of registered DAGUpdateListener instances.
/// This stack is maintained by DAGUpdateListener RAII.
- DAGUpdateListener *UpdateListeners;
+ DAGUpdateListener *UpdateListeners = nullptr;
/// Implementation of setSubgraphColor.
/// Return whether we had to truncate the search.
@@ -316,11 +359,10 @@ private:
Node->OperandList = nullptr;
}
- void operator=(const SelectionDAG&) = delete;
- SelectionDAG(const SelectionDAG&) = delete;
-
public:
- explicit SelectionDAG(const TargetMachine &TM, llvm::CodeGenOpt::Level);
+ explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
+ SelectionDAG(const SelectionDAG &) = delete;
+ SelectionDAG &operator=(const SelectionDAG &) = delete;
~SelectionDAG();
/// Prepare this SelectionDAG to process code in the given MachineFunction.
@@ -364,12 +406,16 @@ public:
/// Convenience for setting subgraph color attribute.
void setSubgraphColor(SDNode *N, const char *Color);
- typedef ilist<SDNode>::const_iterator allnodes_const_iterator;
+ using allnodes_const_iterator = ilist<SDNode>::const_iterator;
+
allnodes_const_iterator allnodes_begin() const { return AllNodes.begin(); }
allnodes_const_iterator allnodes_end() const { return AllNodes.end(); }
- typedef ilist<SDNode>::iterator allnodes_iterator;
+
+ using allnodes_iterator = ilist<SDNode>::iterator;
+
allnodes_iterator allnodes_begin() { return AllNodes.begin(); }
allnodes_iterator allnodes_end() { return AllNodes.end(); }
+
ilist<SDNode>::size_type allnodes_size() const {
return AllNodes.size();
}
@@ -475,7 +521,6 @@ public:
//===--------------------------------------------------------------------===//
// Node creation methods.
- //
/// \brief Create a ConstantSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
@@ -1251,9 +1296,11 @@ public:
SDDbgInfo::DbgIterator DbgBegin() { return DbgInfo->DbgBegin(); }
SDDbgInfo::DbgIterator DbgEnd() { return DbgInfo->DbgEnd(); }
+
SDDbgInfo::DbgIterator ByvalParmDbgBegin() {
return DbgInfo->ByvalParmDbgBegin();
}
+
SDDbgInfo::DbgIterator ByvalParmDbgEnd() {
return DbgInfo->ByvalParmDbgEnd();
}
@@ -1479,10 +1526,12 @@ private:
};
template <> struct GraphTraits<SelectionDAG*> : public GraphTraits<SDNode*> {
- typedef pointer_iterator<SelectionDAG::allnodes_iterator> nodes_iterator;
+ using nodes_iterator = pointer_iterator<SelectionDAG::allnodes_iterator>;
+
static nodes_iterator nodes_begin(SelectionDAG *G) {
return nodes_iterator(G->allnodes_begin());
}
+
static nodes_iterator nodes_end(SelectionDAG *G) {
return nodes_iterator(G->allnodes_end());
}
@@ -1493,7 +1542,6 @@ SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
ArrayRef<SDValue> Ops,
const SDLoc &dl, EVT MemVT,
MachineMemOperand *MMO) {
-
/// Compose node ID and try to find an existing node.
FoldingSetNodeID ID;
unsigned Opcode =
@@ -1524,6 +1572,6 @@ SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
return SDValue(N, 0);
}
} // end namespace llvm
-#endif
+#endif // LLVM_CODEGEN_SELECTIONDAG_H
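
DAGNodeDeletedListener above wraps a std::function and chains itself onto the DAG's listener stack (the UpdateListeners pointer shown in the same hunk). A self-contained sketch of that RAII-registration pattern, with toy types rather than LLVM's:

#include <cstdio>
#include <functional>

struct Node { int Id; };

struct Graph {
  struct Listener {
    Graph &G;
    std::function<void(Node *, Node *)> Callback;
    Listener *Next;

    Listener(Graph &G, std::function<void(Node *, Node *)> CB)
        : G(G), Callback(std::move(CB)), Next(G.Listeners) {
      G.Listeners = this; // push onto the graph's listener stack
    }
    ~Listener() { G.Listeners = Next; } // RAII unregistration
  };

  Listener *Listeners = nullptr;

  void deleteNode(Node *N, Node *ReplacedBy) {
    for (Listener *L = Listeners; L; L = L->Next)
      L->Callback(N, ReplacedBy);
  }
};

int main() {
  Graph G;
  Node A{1}, B{2};
  Graph::Listener L(G, [](Node *N, Node *E) {
    std::printf("node %d deleted, uses moved to node %d\n", N->Id, E->Id);
  });
  G.deleteNode(&A, &B);
  return 0;
}
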
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 973c5aac5281..3a4feb322092 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -37,6 +37,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
@@ -53,14 +54,18 @@
namespace llvm {
-class SelectionDAG;
+class APInt;
+class Constant;
+template <typename T> struct DenseMapInfo;
class GlobalValue;
class MachineBasicBlock;
class MachineConstantPoolValue;
+class MCSymbol;
+class raw_ostream;
class SDNode;
+class SelectionDAG;
+class Type;
class Value;
-class MCSymbol;
-template <typename T> struct DenseMapInfo;
void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr,
bool force = false);
@@ -229,13 +234,15 @@ template <> struct isPodLike<SDValue> { static const bool value = true; };
/// Allow casting operators to work directly on
/// SDValues as if they were SDNode*'s.
template<> struct simplify_type<SDValue> {
- typedef SDNode* SimpleType;
+ using SimpleType = SDNode *;
+
static SimpleType getSimplifiedValue(SDValue &Val) {
return Val.getNode();
}
};
template<> struct simplify_type<const SDValue> {
- typedef /*const*/ SDNode* SimpleType;
+ using SimpleType = /*const*/ SDNode *;
+
static SimpleType getSimplifiedValue(const SDValue &Val) {
return Val.getNode();
}
@@ -330,7 +337,8 @@ private:
/// simplify_type specializations - Allow casting operators to work directly on
/// SDValues as if they were SDNode*'s.
template<> struct simplify_type<SDUse> {
- typedef SDNode* SimpleType;
+ using SimpleType = SDNode *;
+
static SimpleType getSimplifiedValue(SDUse &Val) {
return Val.getNode();
}
@@ -695,10 +703,10 @@ public:
explicit use_iterator(SDUse *op) : Op(op) {}
public:
- typedef std::iterator<std::forward_iterator_tag,
- SDUse, ptrdiff_t>::reference reference;
- typedef std::iterator<std::forward_iterator_tag,
- SDUse, ptrdiff_t>::pointer pointer;
+ using reference = std::iterator<std::forward_iterator_tag,
+ SDUse, ptrdiff_t>::reference;
+ using pointer = std::iterator<std::forward_iterator_tag,
+ SDUse, ptrdiff_t>::pointer;
use_iterator() = default;
use_iterator(const use_iterator &I) : Op(I.Op) {}
@@ -824,7 +832,7 @@ public:
return OperandList[Num];
}
- typedef SDUse* op_iterator;
+ using op_iterator = SDUse *;
op_iterator op_begin() const { return OperandList; }
op_iterator op_end() const { return OperandList+NumOperands; }
@@ -896,7 +904,8 @@ public:
return getValueType(ResNo).getSizeInBits();
}
- typedef const EVT* value_iterator;
+ using value_iterator = const EVT *;
+
value_iterator value_begin() const { return ValueList; }
value_iterator value_end() const { return ValueList+NumValues; }
@@ -1822,8 +1831,7 @@ class BlockAddressSDNode : public SDNode {
BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
int64_t o, unsigned char Flags)
: SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)),
- BA(ba), Offset(o), TargetFlags(Flags) {
- }
+ BA(ba), Offset(o), TargetFlags(Flags) {}
public:
const BlockAddress *getBlockAddress() const { return BA; }
@@ -2154,7 +2162,7 @@ public:
/// instruction selection proper phase.
class MachineSDNode : public SDNode {
public:
- typedef MachineMemOperand **mmo_iterator;
+ using mmo_iterator = MachineMemOperand **;
private:
friend class SelectionDAG;
@@ -2226,8 +2234,8 @@ public:
};
template <> struct GraphTraits<SDNode*> {
- typedef SDNode *NodeRef;
- typedef SDNodeIterator ChildIteratorType;
+ using NodeRef = SDNode *;
+ using ChildIteratorType = SDNodeIterator;
static NodeRef getEntryNode(SDNode *N) { return N; }
@@ -2244,12 +2252,12 @@ template <> struct GraphTraits<SDNode*> {
///
/// This needs to be a union because the largest node differs on 32 bit systems
/// with 4 and 8 byte pointer alignment, respectively.
-typedef AlignedCharArrayUnion<AtomicSDNode, TargetIndexSDNode,
- BlockAddressSDNode, GlobalAddressSDNode>
- LargestSDNode;
+using LargestSDNode = AlignedCharArrayUnion<AtomicSDNode, TargetIndexSDNode,
+ BlockAddressSDNode,
+ GlobalAddressSDNode>;
/// The SDNode class with the greatest alignment requirement.
-typedef GlobalAddressSDNode MostAlignedSDNode;
+using MostAlignedSDNode = GlobalAddressSDNode;
namespace ISD {
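
The LargestSDNode alias above exists so the node pool can allocate one fixed-size slot that fits any SDNode subclass. A sketch of the same idea using the standard-library analogue of AlignedCharArrayUnion (illustrative node types, not from the patch):

#include <new>
#include <type_traits>

struct SmallNode { int A; };
struct BigNode { double B[4]; };

// Raw storage big enough and aligned enough for the largest / most aligned
// of the listed types.
using Storage = std::aligned_union_t<0, SmallNode, BigNode>;

static_assert(sizeof(Storage) >= sizeof(BigNode), "fits the largest node");
static_assert(alignof(Storage) >= alignof(BigNode), "meets max alignment");

int main() {
  Storage Buf;
  // Placement-new any covered type into the shared buffer.
  auto *N = new (&Buf) BigNode{};
  N->~BigNode();
  return 0;
}
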
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 14fc3a499a08..a275b2721b44 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -20,17 +20,26 @@
#define LLVM_CODEGEN_SLOTINDEXES_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ilist.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/ilist.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
namespace llvm {
+class raw_ostream;
+
/// This class represents an entry in the slot index list held in the
/// SlotIndexes pass. It should not be used directly. See the
/// SlotIndex & SlotIndexes classes for the public interface to this
@@ -40,7 +49,6 @@ namespace llvm {
unsigned index;
public:
-
IndexListEntry(MachineInstr *mi, unsigned index) : mi(mi), index(index) {}
MachineInstr* getInstr() const { return mi; }
@@ -301,7 +309,7 @@ namespace llvm {
return os;
}
- typedef std::pair<SlotIndex, MachineBasicBlock*> IdxMBBPair;
+ using IdxMBBPair = std::pair<SlotIndex, MachineBasicBlock *>;
inline bool operator<(SlotIndex V, const IdxMBBPair &IM) {
return V < IM.first;
@@ -325,7 +333,7 @@ namespace llvm {
// IndexListEntry allocator.
BumpPtrAllocator ileAllocator;
- typedef ilist<IndexListEntry> IndexList;
+ using IndexList = ilist<IndexListEntry>;
IndexList indexList;
#ifdef EXPENSIVE_CHECKS
@@ -334,7 +342,7 @@ namespace llvm {
MachineFunction *mf;
- typedef DenseMap<const MachineInstr*, SlotIndex> Mi2IndexMap;
+ using Mi2IndexMap = DenseMap<const MachineInstr *, SlotIndex>;
Mi2IndexMap mi2iMap;
/// MBBRanges - Map MBB number to (start, stop) indexes.
@@ -436,7 +444,7 @@ namespace llvm {
const MachineBasicBlock *MBB = MI.getParent();
assert(MBB && "MI must be inserted inside a basic block");
MachineBasicBlock::const_iterator I = MI, B = MBB->begin();
- for (;;) {
+ while (true) {
if (I == B)
return getMBBStartIdx(MBB);
--I;
@@ -453,7 +461,7 @@ namespace llvm {
const MachineBasicBlock *MBB = MI.getParent();
assert(MBB && "MI must be inserted inside a basic block");
MachineBasicBlock::const_iterator I = MI, E = MBB->end();
- for (;;) {
+ while (true) {
++I;
if (I == E)
return getMBBEndIdx(MBB);
@@ -497,21 +505,25 @@ namespace llvm {
/// Iterator over the idx2MBBMap (sorted pairs of a basic block's starting
/// slot index and the basic block itself).
- typedef SmallVectorImpl<IdxMBBPair>::const_iterator MBBIndexIterator;
+ using MBBIndexIterator = SmallVectorImpl<IdxMBBPair>::const_iterator;
+
/// Move iterator to the next IdxMBBPair where the SlotIndex is greater or
/// equal to \p To.
MBBIndexIterator advanceMBBIndex(MBBIndexIterator I, SlotIndex To) const {
return std::lower_bound(I, idx2MBBMap.end(), To);
}
+
/// Get an iterator pointing to the IdxMBBPair with the biggest SlotIndex
/// that is greater or equal to \p Idx.
MBBIndexIterator findMBBIndex(SlotIndex Idx) const {
return advanceMBBIndex(idx2MBBMap.begin(), Idx);
}
+
/// Returns an iterator for the begin of the idx2MBBMap.
MBBIndexIterator MBBIndexBegin() const {
return idx2MBBMap.begin();
}
+
/// Return an iterator for the end of the idx2MBBMap.
MBBIndexIterator MBBIndexEnd() const {
return idx2MBBMap.end();
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index a18936feea7b..8263946ed928 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -145,21 +145,27 @@ public:
///
/// Statepoint operands take the form:
/// <id>, <num patch bytes >, <num call arguments>, <call target>,
-/// [call arguments], <StackMaps::ConstantOp>, <calling convention>,
+/// [call arguments...],
+/// <StackMaps::ConstantOp>, <calling convention>,
/// <StackMaps::ConstantOp>, <statepoint flags>,
-/// <StackMaps::ConstantOp>, <num other args>, [other args],
-/// [gc values]
+/// <StackMaps::ConstantOp>, <num deopt args>, [deopt args...],
+/// <gc base/derived pairs...> <gc allocas...>
+/// Note that the last two sets of arguments are not currently
+/// length-prefixed.
class StatepointOpers {
-private:
+ // TODO: we should change the STATEPOINT representation so that CC and
+ // Flags are part of the meta operands, with args, deopt operands, and
+ // gc operands all prefixed by their length and a type code. This would be
+ // much more consistent.
+public:
// These values are absolute offsets into the operands of the statepoint
// instruction.
enum { IDPos, NBytesPos, NCallArgsPos, CallTargetPos, MetaEnd };
// These values are relative offsets from the start of the statepoint meta
// arguments (i.e. the end of the call arguments).
- enum { CCOffset = 1, FlagsOffset = 3, NumVMSArgsOffset = 5 };
+ enum { CCOffset = 1, FlagsOffset = 3, NumDeoptOperandsOffset = 5 };
-public:
explicit StatepointOpers(const MachineInstr *MI) : MI(MI) {}
/// Get starting index of non call related arguments
@@ -220,7 +226,7 @@ public:
// OpTypes are used to encode information about the following logical
// operand (which may consist of several MachineOperands) for the
// OpParser.
- typedef enum { DirectMemRefOp, IndirectMemRefOp, ConstantOp } OpType;
+ using OpType = enum { DirectMemRefOp, IndirectMemRefOp, ConstantOp };
StackMaps(AsmPrinter &AP);
@@ -248,9 +254,10 @@ public:
private:
static const char *WSMP;
- typedef SmallVector<Location, 8> LocationVec;
- typedef SmallVector<LiveOutReg, 8> LiveOutVec;
- typedef MapVector<uint64_t, uint64_t> ConstantPool;
+
+ using LocationVec = SmallVector<Location, 8>;
+ using LiveOutVec = SmallVector<LiveOutReg, 8>;
+ using ConstantPool = MapVector<uint64_t, uint64_t>;
struct FunctionInfo {
uint64_t StackSize = 0;
@@ -273,8 +280,8 @@ private:
LiveOuts(std::move(LiveOuts)) {}
};
- typedef MapVector<const MCSymbol *, FunctionInfo> FnInfoMap;
- typedef std::vector<CallsiteInfo> CallsiteInfoList;
+ using FnInfoMap = MapVector<const MCSymbol *, FunctionInfo>;
+ using CallsiteInfoList = std::vector<CallsiteInfo>;
AsmPrinter &AP;
CallsiteInfoList CSInfos;
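
The two enums in StatepointOpers encode the operand layout documented above: four fixed header slots, then the call arguments, then the meta arguments. A hedged sketch of the resulting index arithmetic (the helpers are hypothetical, not LLVM's implementation):

#include <cstdint>

enum { IDPos, NBytesPos, NCallArgsPos, CallTargetPos, MetaEnd };
enum { CCOffset = 1, FlagsOffset = 3, NumDeoptOperandsOffset = 5 };

// Index of the first meta operand, given the call-argument count stored
// at NCallArgsPos.
constexpr uint32_t metaStart(uint32_t NumCallArgs) {
  return MetaEnd + NumCallArgs;
}

// The calling convention sits one slot past the ConstantOp tag that opens
// the meta arguments.
constexpr uint32_t ccIdx(uint32_t NumCallArgs) {
  return metaStart(NumCallArgs) + CCOffset;
}

static_assert(metaStart(2) == 6, "4 header slots + 2 call args");
static_assert(ccIdx(2) == 7, "CC follows the ConstantOp tag");

int main() { return 0; }
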
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
index 1992412120aa..4365fca74bf1 100644
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -55,6 +55,9 @@ public:
/// Return the MCSchedClassDesc for this instruction.
const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
+ /// \brief TargetSubtargetInfo getter.
+ const TargetSubtargetInfo *getSubtargetInfo() const { return STI; }
+
/// \brief TargetInstrInfo getter.
const TargetInstrInfo *getInstrInfo() const { return TII; }
diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h
index dd730495a5f6..8043024626a0 100644
--- a/include/llvm/CodeGen/WinEHFuncInfo.h
+++ b/include/llvm/CodeGen/WinEHFuncInfo.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/WinEHFuncInfo.h ----------------------------*- C++ -*-===//
+//===- llvm/CodeGen/WinEHFuncInfo.h -----------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,28 +17,26 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/IR/Instructions.h"
+#include <cstdint>
+#include <limits>
+#include <utility>
namespace llvm {
+
class AllocaInst;
class BasicBlock;
-class CatchReturnInst;
-class Constant;
+class FuncletPadInst;
class Function;
class GlobalVariable;
+class Instruction;
class InvokeInst;
-class IntrinsicInst;
-class LandingPadInst;
-class MCExpr;
-class MCSymbol;
class MachineBasicBlock;
-class Value;
+class MCSymbol;
// The following structs represent the .xdata tables for various
// Windows-related EH personalities.
-typedef PointerUnion<const BasicBlock *, MachineBasicBlock *> MBBOrBasicBlock;
+using MBBOrBasicBlock = PointerUnion<const BasicBlock *, MachineBasicBlock *>;
struct CxxUnwindMapEntry {
int ToState;
@@ -99,18 +97,18 @@ struct WinEHFuncInfo {
SmallVector<WinEHTryBlockMapEntry, 4> TryBlockMap;
SmallVector<SEHUnwindMapEntry, 4> SEHUnwindMap;
SmallVector<ClrEHUnwindMapEntry, 4> ClrEHUnwindMap;
- int UnwindHelpFrameIdx = INT_MAX;
- int PSPSymFrameIdx = INT_MAX;
+ int UnwindHelpFrameIdx = std::numeric_limits<int>::max();
+ int PSPSymFrameIdx = std::numeric_limits<int>::max();
int getLastStateNumber() const { return CxxUnwindMap.size() - 1; }
void addIPToStateRange(const InvokeInst *II, MCSymbol *InvokeBegin,
MCSymbol *InvokeEnd);
- int EHRegNodeFrameIndex = INT_MAX;
- int EHRegNodeEndOffset = INT_MAX;
- int EHGuardFrameIndex = INT_MAX;
- int SEHSetFrameOffset = INT_MAX;
+ int EHRegNodeFrameIndex = std::numeric_limits<int>::max();
+ int EHRegNodeEndOffset = std::numeric_limits<int>::max();
+ int EHGuardFrameIndex = std::numeric_limits<int>::max();
+ int SEHSetFrameOffset = std::numeric_limits<int>::max();
WinEHFuncInfo();
};
@@ -125,5 +123,7 @@ void calculateSEHStateNumbers(const Function *ParentFn,
WinEHFuncInfo &FuncInfo);
void calculateClrEHStateNumbers(const Function *Fn, WinEHFuncInfo &FuncInfo);
-}
+
+} // end namespace llvm
+
#endif // LLVM_CODEGEN_WINEHFUNCINFO_H
diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h
index 4e8c8feb7a12..9890263ae2d2 100644
--- a/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -574,6 +574,14 @@ struct FrameData {
IsFunctionStart = 1 << 2,
};
};
+
+enum class CodeViewContainer { ObjectFile, Pdb };
+
+inline uint32_t alignOf(CodeViewContainer Container) {
+ if (Container == CodeViewContainer::ObjectFile)
+ return 1;
+ return 4;
+}
}
}
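
The new alignOf(CodeViewContainer) above feeds the padToAlignment machinery added in the next file: symbol records inside a PDB are padded to 4 bytes, while object-file records are packed. A sketch of the padding arithmetic (the rounding helper here is hypothetical):

#include <cstdint>

enum class CodeViewContainer { ObjectFile, Pdb };

inline uint32_t alignOf(CodeViewContainer C) {
  return C == CodeViewContainer::ObjectFile ? 1 : 4;
}

// Round Offset up to the container's record alignment (a power of two).
inline uint32_t padTo(uint32_t Offset, CodeViewContainer C) {
  uint32_t A = alignOf(C);
  return (Offset + A - 1) & ~(A - 1);
}

int main() {
  return (padTo(6, CodeViewContainer::Pdb) == 8 &&
          padTo(6, CodeViewContainer::ObjectFile) == 6)
             ? 0
             : 1;
}
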
diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
index b3976826a316..db944c7057f7 100644
--- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
+++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
@@ -136,6 +136,7 @@ public:
Error mapByteVectorTail(ArrayRef<uint8_t> &Bytes);
Error mapByteVectorTail(std::vector<uint8_t> &Bytes);
+ Error padToAlignment(uint32_t Align);
Error skipPadding();
private:
diff --git a/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h b/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
index e7036033d2d9..c958a95ee6de 100644
--- a/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
+++ b/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
@@ -60,8 +60,8 @@ public:
Error initialize(BinaryStreamReader Reader);
Error initialize(BinaryStreamRef Stream);
- Iterator begin() { return Checksums.begin(); }
- Iterator end() { return Checksums.end(); }
+ Iterator begin() const { return Checksums.begin(); }
+ Iterator end() const { return Checksums.end(); }
const FileChecksumArray &getArray() const { return Checksums; }
diff --git a/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h b/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
index e2cfc3c99233..60440700c265 100644
--- a/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
+++ b/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
@@ -74,8 +74,13 @@ private:
class DebugInlineeLinesSubsection final : public DebugSubsection {
public:
+ struct Entry {
+ std::vector<support::ulittle32_t> ExtraFiles;
+ InlineeSourceLineHeader Header;
+ };
+
DebugInlineeLinesSubsection(DebugChecksumsSubsection &Checksums,
- bool HasExtraFiles);
+ bool HasExtraFiles = false);
static bool classof(const DebugSubsection *S) {
return S->kind() == DebugSubsectionKind::InlineeLines;
@@ -87,16 +92,18 @@ public:
void addInlineSite(TypeIndex FuncId, StringRef FileName, uint32_t SourceLine);
void addExtraFile(StringRef FileName);
+ bool hasExtraFiles() const { return HasExtraFiles; }
+ void setHasExtraFiles(bool Has) { HasExtraFiles = Has; }
+
+ std::vector<Entry>::const_iterator begin() const { return Entries.begin(); }
+ std::vector<Entry>::const_iterator end() const { return Entries.end(); }
+
private:
DebugChecksumsSubsection &Checksums;
bool HasExtraFiles = false;
uint32_t ExtraFileCount = 0;
- struct Entry {
- std::vector<support::ulittle32_t> ExtraFiles;
- InlineeSourceLineHeader Header;
- };
std::vector<Entry> Entries;
};
}
diff --git a/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h b/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
index b2e1131e5968..847259c5ceac 100644
--- a/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
+++ b/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
@@ -31,28 +31,32 @@ struct DebugSubsectionHeader {
class DebugSubsectionRecord {
public:
DebugSubsectionRecord();
- DebugSubsectionRecord(DebugSubsectionKind Kind, BinaryStreamRef Data);
+ DebugSubsectionRecord(DebugSubsectionKind Kind, BinaryStreamRef Data,
+ CodeViewContainer Container);
- static Error initialize(BinaryStreamRef Stream, DebugSubsectionRecord &Info);
+ static Error initialize(BinaryStreamRef Stream, DebugSubsectionRecord &Info,
+ CodeViewContainer Container);
uint32_t getRecordLength() const;
DebugSubsectionKind kind() const;
BinaryStreamRef getRecordData() const;
private:
+ CodeViewContainer Container;
DebugSubsectionKind Kind;
BinaryStreamRef Data;
};
class DebugSubsectionRecordBuilder {
public:
- DebugSubsectionRecordBuilder(DebugSubsectionKind Kind, DebugSubsection &Frag);
+ DebugSubsectionRecordBuilder(std::unique_ptr<DebugSubsection> Subsection,
+ CodeViewContainer Container);
uint32_t calculateSerializedLength();
Error commit(BinaryStreamWriter &Writer);
private:
- DebugSubsectionKind Kind;
- DebugSubsection &Frag;
+ std::unique_ptr<DebugSubsection> Subsection;
+ CodeViewContainer Container;
};
} // namespace codeview
@@ -62,7 +66,12 @@ template <> struct VarStreamArrayExtractor<codeview::DebugSubsectionRecord> {
static Error extract(BinaryStreamRef Stream, uint32_t &Length,
codeview::DebugSubsectionRecord &Info) {
- if (auto EC = codeview::DebugSubsectionRecord::initialize(Stream, Info))
+ // FIXME: We need to pass the container type through to this function, but
+ // VarStreamArray doesn't easily support stateful contexts. In practice
+ // this isn't super important since the subsection header describes its
+ // length and we can just skip it. It's more important when writing.
+ if (auto EC = codeview::DebugSubsectionRecord::initialize(
+ Stream, Info, codeview::CodeViewContainer::Pdb))
return EC;
Length = Info.getRecordLength();
return Error::success();
diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
index 428ff153d5d1..7080b0480757 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
@@ -24,9 +24,9 @@ namespace codeview {
class SymbolVisitorDelegate;
class SymbolDeserializer : public SymbolVisitorCallbacks {
struct MappingInfo {
- explicit MappingInfo(ArrayRef<uint8_t> RecordData)
+ MappingInfo(ArrayRef<uint8_t> RecordData, CodeViewContainer Container)
: Stream(RecordData, llvm::support::little), Reader(Stream),
- Mapping(Reader) {}
+ Mapping(Reader, Container) {}
BinaryByteStream Stream;
BinaryStreamReader Reader;
@@ -35,7 +35,9 @@ class SymbolDeserializer : public SymbolVisitorCallbacks {
public:
template <typename T> static Error deserializeAs(CVSymbol Symbol, T &Record) {
- SymbolDeserializer S(nullptr);
+ // If we're just deserializing one record, then don't worry about alignment
+ // as there's nothing that comes after.
+ SymbolDeserializer S(nullptr, CodeViewContainer::ObjectFile);
if (auto EC = S.visitSymbolBegin(Symbol))
return EC;
if (auto EC = S.visitKnownRecord(Symbol, Record))
@@ -45,12 +47,13 @@ public:
return Error::success();
}
- explicit SymbolDeserializer(SymbolVisitorDelegate *Delegate)
- : Delegate(Delegate) {}
+ explicit SymbolDeserializer(SymbolVisitorDelegate *Delegate,
+ CodeViewContainer Container)
+ : Delegate(Delegate), Container(Container) {}
Error visitSymbolBegin(CVSymbol &Record) override {
assert(!Mapping && "Already in a symbol mapping!");
- Mapping = llvm::make_unique<MappingInfo>(Record.content());
+ Mapping = llvm::make_unique<MappingInfo>(Record.content(), Container);
return Mapping->Mapping.visitSymbolBegin(Record);
}
Error visitSymbolEnd(CVSymbol &Record) override {
@@ -78,6 +81,7 @@ private:
}
SymbolVisitorDelegate *Delegate;
+ CodeViewContainer Container;
std::unique_ptr<MappingInfo> Mapping;
};
}
diff --git a/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/include/llvm/DebugInfo/CodeView/SymbolDumper.h
index e91065dcf87e..293daa851bdd 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDumper.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDumper.h
@@ -26,9 +26,11 @@ class TypeCollection;
class CVSymbolDumper {
public:
CVSymbolDumper(ScopedPrinter &W, TypeCollection &Types,
+ CodeViewContainer Container,
std::unique_ptr<SymbolDumpDelegate> ObjDelegate,
bool PrintRecordBytes)
- : W(W), Types(Types), ObjDelegate(std::move(ObjDelegate)),
+ : W(W), Types(Types), Container(Container),
+ ObjDelegate(std::move(ObjDelegate)),
PrintRecordBytes(PrintRecordBytes) {}
/// Dumps one type record. Returns false if there was a type parsing error,
@@ -44,6 +46,7 @@ public:
private:
ScopedPrinter &W;
TypeCollection &Types;
+ CodeViewContainer Container;
std::unique_ptr<SymbolDumpDelegate> ObjDelegate;
bool PrintRecordBytes;
diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
index 5d072a3b2723..391e8f127665 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
@@ -20,8 +20,12 @@ class BinaryStreamWriter;
namespace codeview {
class SymbolRecordMapping : public SymbolVisitorCallbacks {
public:
- explicit SymbolRecordMapping(BinaryStreamReader &Reader) : IO(Reader) {}
- explicit SymbolRecordMapping(BinaryStreamWriter &Writer) : IO(Writer) {}
+ explicit SymbolRecordMapping(BinaryStreamReader &Reader,
+ CodeViewContainer Container)
+ : IO(Reader), Container(Container) {}
+ explicit SymbolRecordMapping(BinaryStreamWriter &Writer,
+ CodeViewContainer Container)
+ : IO(Writer), Container(Container) {}
Error visitSymbolBegin(CVSymbol &Record) override;
Error visitSymbolEnd(CVSymbol &Record) override;
@@ -35,6 +39,7 @@ private:
Optional<SymbolKind> Kind;
CodeViewRecordIO IO;
+ CodeViewContainer Container;
};
}
}
diff --git a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
index a8fe1a3ae1d0..42adbdb4e20f 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
@@ -46,17 +46,18 @@ class SymbolSerializer : public SymbolVisitorCallbacks {
public:
template <typename SymType>
- static CVSymbol writeOneSymbol(SymType &Sym, BumpPtrAllocator &Storage) {
+ static CVSymbol writeOneSymbol(SymType &Sym, BumpPtrAllocator &Storage,
+ CodeViewContainer Container) {
CVSymbol Result;
Result.Type = static_cast<SymbolKind>(Sym.Kind);
- SymbolSerializer Serializer(Storage);
+ SymbolSerializer Serializer(Storage, Container);
consumeError(Serializer.visitSymbolBegin(Result));
consumeError(Serializer.visitKnownRecord(Result, Sym));
consumeError(Serializer.visitSymbolEnd(Result));
return Result;
}
- explicit SymbolSerializer(BumpPtrAllocator &Storage);
+ SymbolSerializer(BumpPtrAllocator &Storage, CodeViewContainer Container);
virtual Error visitSymbolBegin(CVSymbol &Record) override;
virtual Error visitSymbolEnd(CVSymbol &Record) override;
diff --git a/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/include/llvm/DebugInfo/MSF/MappedBlockStream.h
index d68f5f70c83e..36dce393fc66 100644
--- a/include/llvm/DebugInfo/MSF/MappedBlockStream.h
+++ b/include/llvm/DebugInfo/MSF/MappedBlockStream.h
@@ -44,17 +44,19 @@ class MappedBlockStream : public BinaryStream {
public:
static std::unique_ptr<MappedBlockStream>
createStream(uint32_t BlockSize, const MSFStreamLayout &Layout,
- BinaryStreamRef MsfData);
+ BinaryStreamRef MsfData, BumpPtrAllocator &Allocator);
static std::unique_ptr<MappedBlockStream>
createIndexedStream(const MSFLayout &Layout, BinaryStreamRef MsfData,
- uint32_t StreamIndex);
+ uint32_t StreamIndex, BumpPtrAllocator &Allocator);
static std::unique_ptr<MappedBlockStream>
- createFpmStream(const MSFLayout &Layout, BinaryStreamRef MsfData);
+ createFpmStream(const MSFLayout &Layout, BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator);
static std::unique_ptr<MappedBlockStream>
- createDirectoryStream(const MSFLayout &Layout, BinaryStreamRef MsfData);
+ createDirectoryStream(const MSFLayout &Layout, BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator);
llvm::support::endianness getEndian() const override {
return llvm::support::little;
@@ -67,9 +69,7 @@ public:
uint32_t getLength() override;
- uint32_t getNumBytesCopied() const;
-
- llvm::BumpPtrAllocator &getAllocator() { return Pool; }
+ llvm::BumpPtrAllocator &getAllocator() { return Allocator; }
void invalidateCache();
@@ -79,7 +79,7 @@ public:
protected:
MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &StreamLayout,
- BinaryStreamRef MsfData);
+ BinaryStreamRef MsfData, BumpPtrAllocator &Allocator);
private:
const MSFStreamLayout &getStreamLayout() const { return StreamLayout; }
@@ -94,7 +94,15 @@ private:
BinaryStreamRef MsfData;
typedef MutableArrayRef<uint8_t> CacheEntry;
- llvm::BumpPtrAllocator Pool;
+
+ // We just store the allocator by reference. We use it to allocate
+ // contiguous memory for things like arrays or strings that cross a block
+ // boundary, and that memory is expected to outlive the stream. For example,
+ // someone could create a stream, read some records, then close the stream,
+ // and we would like outstanding references to those records to remain valid
+ // since the entire file is mapped anyway. Because of that, the user must
+ // supply the allocator from which memory for records that span block
+ // boundaries is allocated.
+ BumpPtrAllocator &Allocator;
DenseMap<uint32_t, std::vector<CacheEntry>> CacheMap;
};
@@ -102,18 +110,20 @@ class WritableMappedBlockStream : public WritableBinaryStream {
public:
static std::unique_ptr<WritableMappedBlockStream>
createStream(uint32_t BlockSize, const MSFStreamLayout &Layout,
- WritableBinaryStreamRef MsfData);
+ WritableBinaryStreamRef MsfData, BumpPtrAllocator &Allocator);
static std::unique_ptr<WritableMappedBlockStream>
createIndexedStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData,
- uint32_t StreamIndex);
+ uint32_t StreamIndex, BumpPtrAllocator &Allocator);
static std::unique_ptr<WritableMappedBlockStream>
createDirectoryStream(const MSFLayout &Layout,
- WritableBinaryStreamRef MsfData);
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator);
static std::unique_ptr<WritableMappedBlockStream>
- createFpmStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData);
+ createFpmStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator);
llvm::support::endianness getEndian() const override {
return llvm::support::little;
@@ -139,7 +149,8 @@ public:
protected:
WritableMappedBlockStream(uint32_t BlockSize,
const MSFStreamLayout &StreamLayout,
- WritableBinaryStreamRef MsfData);
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator);
private:
MappedBlockStream ReadInterface;
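
The comment above explains why MappedBlockStream now borrows its allocator instead of owning it: bytes copied for records that span block boundaries must outlive the stream. A self-contained sketch of that ownership shape with toy types:

#include <cstddef>
#include <cstring>
#include <vector>

struct Arena {
  std::vector<char *> Chunks;
  char *allocate(std::size_t N) {
    Chunks.push_back(new char[N]);
    return Chunks.back();
  }
  ~Arena() {
    for (char *C : Chunks)
      delete[] C;
  }
};

struct BlockStream {
  Arena &A; // borrowed, not owned; must outlive this stream
  explicit BlockStream(Arena &A) : A(A) {}

  // A read that crosses a block boundary copies into the arena, so the
  // returned pointer stays valid after the stream is destroyed.
  const char *readContiguous(const char *Src, std::size_t N) {
    char *Buf = A.allocate(N);
    std::memcpy(Buf, Src, N);
    return Buf;
  }
};

int main() {
  Arena A;
  const char *P;
  {
    BlockStream S(A);
    P = S.readContiguous("split record", 13);
  } // stream gone; P is still valid because the arena owns the bytes
  return P[0] == 's' ? 0 : 1;
}
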
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
index e5858d0f45e3..2ff166b24e68 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
@@ -49,11 +49,8 @@ public:
void setObjFileName(StringRef Name);
void addSymbol(codeview::CVSymbol Symbol);
- void addC13Fragment(std::unique_ptr<codeview::DebugLinesSubsection> Lines);
- void addC13Fragment(
- std::unique_ptr<codeview::DebugInlineeLinesSubsection> Inlinees);
- void setC13FileChecksums(
- std::unique_ptr<codeview::DebugChecksumsSubsection> Checksums);
+ void
+ addDebugSubsection(std::unique_ptr<codeview::DebugSubsection> Subsection);
uint16_t getStreamIndex() const;
StringRef getModuleName() const { return ModuleName; }
@@ -83,10 +80,6 @@ private:
std::vector<std::string> SourceFiles;
std::vector<codeview::CVSymbol> Symbols;
- std::unique_ptr<codeview::DebugChecksumsSubsection> ChecksumInfo;
- std::vector<std::unique_ptr<codeview::DebugLinesSubsection>> LineInfo;
- std::vector<std::unique_ptr<codeview::DebugInlineeLinesSubsection>> Inlinees;
-
std::vector<std::unique_ptr<codeview::DebugSubsectionRecordBuilder>>
C13Builders;
diff --git a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
index 822ce3ce13d3..a8121978d882 100644
--- a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
@@ -12,6 +12,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/DebugInfo/CodeView/CVRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
@@ -25,7 +26,7 @@ class PDBFile;
class DbiModuleDescriptor;
class ModuleDebugStreamRef {
- typedef codeview::DebugSubsectionArray::Iterator LinesAndChecksumsIterator;
+ typedef codeview::DebugSubsectionArray::Iterator DebugSubsectionIterator;
public:
ModuleDebugStreamRef(const DbiModuleDescriptor &Module,
@@ -39,12 +40,15 @@ public:
iterator_range<codeview::CVSymbolArray::Iterator>
symbols(bool *HadError) const;
- llvm::iterator_range<LinesAndChecksumsIterator> linesAndChecksums() const;
+ llvm::iterator_range<DebugSubsectionIterator> subsections() const;
- bool hasLineInfo() const;
+ bool hasDebugSubsections() const;
Error commit();
+ Expected<codeview::DebugChecksumsSubsectionRef>
+ findChecksumsSubsection() const;
+
private:
const DbiModuleDescriptor &Mod;
@@ -57,7 +61,7 @@ private:
BinaryStreamRef C13LinesSubstream;
BinaryStreamRef GlobalRefsSubstream;
- codeview::DebugSubsectionArray LinesAndChecksums;
+ codeview::DebugSubsectionArray Subsections;
};
}
}
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h b/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
index 6aeb0a5479cb..28a14d7356d2 100644
--- a/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
+++ b/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
@@ -45,6 +45,8 @@ public:
FixedStreamArray<support::ulittle32_t> name_ids() const;
+ codeview::DebugStringTableSubsectionRef getStringTable() const;
+
private:
Error readHeader(BinaryStreamReader &Reader);
Error readStrings(BinaryStreamReader &Reader);
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index 17fba9991c2e..0ee697696ca5 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -34,8 +34,7 @@ class TpiStream {
friend class TpiStreamBuilder;
public:
- TpiStream(const PDBFile &File,
- std::unique_ptr<msf::MappedBlockStream> Stream);
+ TpiStream(PDBFile &File, std::unique_ptr<msf::MappedBlockStream> Stream);
~TpiStream();
Error reload();
@@ -61,7 +60,7 @@ public:
Error commit();
private:
- const PDBFile &Pdb;
+ PDBFile &Pdb;
std::unique_ptr<msf::MappedBlockStream> Stream;
std::unique_ptr<codeview::LazyRandomTypeCollection> Types;
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 4afb5d9d63b2..8e6bb4baccaf 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -86,6 +86,10 @@ namespace llvm {
/// Construct any deferred debug info descriptors.
void finalize();
+ /// Finalize a specific subprogram - no new variables may be added to this
+ /// subprogram afterwards.
+ void finalizeSubprogram(DISubprogram *SP);
+
/// A CompileUnit provides an anchor for all debugging
/// information generated during this instance of compilation.
/// \param Lang Source programming language, eg. dwarf::DW_LANG_C99
diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h
index aa74f361cda2..eef1212abc4b 100644
--- a/include/llvm/IR/DebugLoc.h
+++ b/include/llvm/IR/DebugLoc.h
@@ -90,12 +90,6 @@ namespace llvm {
DenseMap<const MDNode *, MDNode *> &Cache,
bool ReplaceLast = false);
- /// Reparent all debug locations referenced by \c I that belong to \c OrigSP
- /// to become (possibly indirect) children of \c NewSP.
- static void reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
- DISubprogram *NewSP,
- DenseMap<const MDNode *, MDNode *> &Cache);
-
unsigned getLine() const;
unsigned getCol() const;
MDNode *getScope() const;
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index c46c609609e2..757ddf6cf46b 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -134,16 +134,18 @@ public:
/// be renamed or references something that can't be renamed).
unsigned NotEligibleToImport : 1;
- /// Indicate that the global value must be considered a live root for
- /// index-based liveness analysis. Used for special LLVM values such as
- /// llvm.global_ctors that the linker does not know about.
- unsigned LiveRoot : 1;
+ /// In per-module summary, indicate that the global value must be considered
+ /// a live root for index-based liveness analysis. Used for special LLVM
+ /// values such as llvm.global_ctors that the linker does not know about.
+ ///
+ /// In combined summary, indicate that the global value is live.
+ unsigned Live : 1;
/// Convenience Constructors
explicit GVFlags(GlobalValue::LinkageTypes Linkage,
- bool NotEligibleToImport, bool LiveRoot)
+ bool NotEligibleToImport, bool Live)
: Linkage(Linkage), NotEligibleToImport(NotEligibleToImport),
- LiveRoot(LiveRoot) {}
+ Live(Live) {}
};
private:
@@ -172,6 +174,8 @@ private:
/// are listed in the derived FunctionSummary object.
std::vector<ValueInfo> RefEdgeList;
+ bool isLive() const { return Flags.Live; }
+
protected:
GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector<ValueInfo> Refs)
: Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {}
@@ -213,19 +217,17 @@ public:
/// Return true if this global value can't be imported.
bool notEligibleToImport() const { return Flags.NotEligibleToImport; }
- /// Return true if this global value must be considered a root for live
- /// value analysis on the index.
- bool liveRoot() const { return Flags.LiveRoot; }
-
- /// Flag that this global value must be considered a root for live
- /// value analysis on the index.
- void setLiveRoot() { Flags.LiveRoot = true; }
+ void setLive(bool Live) { Flags.Live = Live; }
/// Flag that this global value cannot be imported.
void setNotEligibleToImport() { Flags.NotEligibleToImport = true; }
/// Return the list of values referenced by this global value definition.
ArrayRef<ValueInfo> refs() const { return RefEdgeList; }
+
+ friend class ModuleSummaryIndex;
+ friend void computeDeadSymbols(class ModuleSummaryIndex &,
+ const DenseSet<GlobalValue::GUID> &);
};
/// \brief Alias summary information.
@@ -535,6 +537,11 @@ private:
/// GUIDs, it will be mapped to 0.
std::map<GlobalValue::GUID, GlobalValue::GUID> OidGuidMap;
+ /// Indicates that summary-based GlobalValue GC has run, and values with
+ /// GVFlags::Live==false are really dead. Otherwise, all values must be
+ /// considered live.
+ bool WithGlobalValueDeadStripping = false;
+
// YAML I/O support.
friend yaml::MappingTraits<ModuleSummaryIndex>;
@@ -550,6 +557,17 @@ public:
const_gvsummary_iterator end() const { return GlobalValueMap.end(); }
size_t size() const { return GlobalValueMap.size(); }
+ bool withGlobalValueDeadStripping() const {
+ return WithGlobalValueDeadStripping;
+ }
+ void setWithGlobalValueDeadStripping() {
+ WithGlobalValueDeadStripping = true;
+ }
+
+ bool isGlobalValueLive(const GlobalValueSummary *GVS) const {
+ return !WithGlobalValueDeadStripping || GVS->isLive();
+ }
+
/// Return a ValueInfo for GUID if it exists, otherwise return ValueInfo().
ValueInfo getValueInfo(GlobalValue::GUID GUID) const {
auto I = GlobalValueMap.find(GUID);
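
isGlobalValueLive above is deliberately conservative: the Live bit only means anything once the summary-based dead-stripping pass has run and set the index-wide flag. A small sketch of the two-phase semantics (toy types mirroring the code shown):

struct Summary { bool Live; };

struct Index {
  bool WithGlobalValueDeadStripping = false;
  bool isGlobalValueLive(const Summary &S) const {
    return !WithGlobalValueDeadStripping || S.Live;
  }
};

int main() {
  Index I;
  Summary Dead{false};
  bool Before = I.isGlobalValueLive(Dead); // true: GC has not run yet
  I.WithGlobalValueDeadStripping = true;
  bool After = I.isGlobalValueLive(Dead);  // false: the flag is now real
  return (Before && !After) ? 0 : 1;
}
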
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index 78fdb602027d..891d84c2dbca 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -128,6 +128,8 @@ template <> struct MappingTraits<TypeIdSummary> {
};
struct FunctionSummaryYaml {
+ unsigned Linkage;
+ bool NotEligibleToImport, Live;
std::vector<uint64_t> TypeTests;
std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
TypeCheckedLoadVCalls;
@@ -168,6 +170,9 @@ namespace yaml {
template <> struct MappingTraits<FunctionSummaryYaml> {
static void mapping(IO &io, FunctionSummaryYaml& summary) {
+ io.mapOptional("Linkage", summary.Linkage);
+ io.mapOptional("NotEligibleToImport", summary.NotEligibleToImport);
+ io.mapOptional("Live", summary.Live);
io.mapOptional("TypeTests", summary.TypeTests);
io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls);
@@ -199,12 +204,12 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
}
auto &Elem = V[KeyInt];
for (auto &FSum : FSums) {
- GlobalValueSummary::GVFlags GVFlags(GlobalValue::ExternalLinkage, false,
- false);
Elem.SummaryList.push_back(llvm::make_unique<FunctionSummary>(
- GVFlags, 0, ArrayRef<ValueInfo>{},
- ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
- std::move(FSum.TypeTestAssumeVCalls),
+ GlobalValueSummary::GVFlags(
+ static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
+ FSum.NotEligibleToImport, FSum.Live),
+ 0, ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{},
+ std::move(FSum.TypeTests), std::move(FSum.TypeTestAssumeVCalls),
std::move(FSum.TypeCheckedLoadVCalls),
std::move(FSum.TypeTestAssumeConstVCalls),
std::move(FSum.TypeCheckedLoadConstVCalls)));
@@ -216,8 +221,10 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
for (auto &Sum : P.second.SummaryList) {
if (auto *FSum = dyn_cast<FunctionSummary>(Sum.get()))
FSums.push_back(FunctionSummaryYaml{
- FSum->type_tests(), FSum->type_test_assume_vcalls(),
- FSum->type_checked_load_vcalls(),
+ FSum->flags().Linkage,
+ static_cast<bool>(FSum->flags().NotEligibleToImport),
+ static_cast<bool>(FSum->flags().Live), FSum->type_tests(),
+ FSum->type_test_assume_vcalls(), FSum->type_checked_load_vcalls(),
FSum->type_test_assume_const_vcalls(),
FSum->type_checked_load_const_vcalls()});
}
@@ -231,6 +238,8 @@ template <> struct MappingTraits<ModuleSummaryIndex> {
static void mapping(IO &io, ModuleSummaryIndex& index) {
io.mapOptional("GlobalValueMap", index.GlobalValueMap);
io.mapOptional("TypeIdMap", index.TypeIdMap);
+ io.mapOptional("WithGlobalValueDeadStripping",
+ index.WithGlobalValueDeadStripping);
}
};
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
index f01607614a0c..a5f0130f79f4 100644
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@@ -228,24 +228,24 @@ public:
return cast<ConstantInt>(NumVMSArgs)->getZExtValue();
}
- typename CallSiteTy::arg_iterator vm_state_begin() const {
+ typename CallSiteTy::arg_iterator deopt_begin() const {
auto I = gc_transition_args_end() + 1;
assert((getCallSite().arg_end() - I) >= 0);
return I;
}
- typename CallSiteTy::arg_iterator vm_state_end() const {
- auto I = vm_state_begin() + getNumTotalVMSArgs();
+ typename CallSiteTy::arg_iterator deopt_end() const {
+ auto I = deopt_begin() + getNumTotalVMSArgs();
assert((getCallSite().arg_end() - I) >= 0);
return I;
}
/// range adapter for vm state arguments
- iterator_range<arg_iterator> vm_state_args() const {
- return make_range(vm_state_begin(), vm_state_end());
+ iterator_range<arg_iterator> deopt_operands() const {
+ return make_range(deopt_begin(), deopt_end());
}
typename CallSiteTy::arg_iterator gc_args_begin() const {
- return vm_state_end();
+ return deopt_end();
}
typename CallSiteTy::arg_iterator gc_args_end() const {
return getCallSite().arg_end();
@@ -289,8 +289,8 @@ public:
(void)arg_end();
(void)gc_transition_args_begin();
(void)gc_transition_args_end();
- (void)vm_state_begin();
- (void)vm_state_end();
+ (void)deopt_begin();
+ (void)deopt_end();
(void)gc_args_begin();
(void)gc_args_end();
}
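For reference, a hedged usage sketch of the renamed accessors; the helper below is illustrative, not part of the patch, and is templated so it works for either statepoint wrapper type declared in this header.

    #include "llvm/IR/Statepoint.h"

    // Illustrative helper: walks the renamed deopt ("vm state") operand range.
    template <typename StatepointTy>
    static unsigned countDeoptOperands(const StatepointTy &SP) {
      unsigned N = 0;
      for (auto I = SP.deopt_begin(), E = SP.deopt_end(); I != E; ++I)
        ++N; // each iteration visits one deopt operand (was vm_state_*)
      return N;
    }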
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 5b9796d4fba6..abb0aa3e3caf 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -86,7 +86,6 @@ void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
void initializeCFGPrinterLegacyPassPass(PassRegistry&);
void initializeCFGSimplifyPassPass(PassRegistry&);
-void initializeLateCFGSimplifyPassPass(PassRegistry&);
void initializeCFGViewerLegacyPassPass(PassRegistry&);
void initializeCFLAndersAAWrapperPassPass(PassRegistry&);
void initializeCFLSteensAAWrapperPassPass(PassRegistry&);
@@ -144,8 +143,8 @@ void initializeGCMachineCodeAnalysisPass(PassRegistry&);
void initializeGCModuleInfoPass(PassRegistry&);
void initializeGCOVProfilerLegacyPassPass(PassRegistry&);
void initializeGVNHoistLegacyPassPass(PassRegistry&);
-void initializeGVNSinkLegacyPassPass(PassRegistry&);
void initializeGVNLegacyPassPass(PassRegistry&);
+void initializeGVNSinkLegacyPassPass(PassRegistry&);
void initializeGlobalDCELegacyPassPass(PassRegistry&);
void initializeGlobalMergePass(PassRegistry&);
void initializeGlobalOptLegacyPassPass(PassRegistry&);
@@ -175,13 +174,14 @@ void initializeIntervalPartitionPass(PassRegistry&);
void initializeJumpThreadingPass(PassRegistry&);
void initializeLCSSAVerificationPassPass(PassRegistry&);
void initializeLCSSAWrapperPassPass(PassRegistry&);
+void initializeLateCFGSimplifyPassPass(PassRegistry&);
void initializeLazyBlockFrequencyInfoPassPass(PassRegistry&);
void initializeLazyBranchProbabilityInfoPassPass(PassRegistry&);
void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry&);
+void initializeLazyValueInfoPrinterPass(PassRegistry&);
void initializeLazyValueInfoWrapperPassPass(PassRegistry&);
void initializeLegacyLICMPassPass(PassRegistry&);
void initializeLegacyLoopSinkPassPass(PassRegistry&);
-void initializeLazyValueInfoPrinterPass(PassRegistry&);
void initializeLegalizerPass(PassRegistry&);
void initializeLibCallsShrinkWrapLegacyPassPass(PassRegistry&);
void initializeLintPass(PassRegistry&);
@@ -195,8 +195,8 @@ void initializeLiveVariablesPass(PassRegistry&);
void initializeLoadCombinePass(PassRegistry&);
void initializeLoadStoreVectorizerPass(PassRegistry&);
void initializeLoaderPassPass(PassRegistry&);
-void initializeLocalizerPass(PassRegistry&);
void initializeLocalStackSlotPassPass(PassRegistry&);
+void initializeLocalizerPass(PassRegistry&);
void initializeLoopAccessLegacyAnalysisPass(PassRegistry&);
void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&);
void initializeLoopDeletionLegacyPassPass(PassRegistry&);
@@ -304,6 +304,7 @@ void initializeProcessImplicitDefsPass(PassRegistry&);
void initializeProfileSummaryInfoWrapperPassPass(PassRegistry&);
void initializePromoteLegacyPassPass(PassRegistry&);
void initializePruneEHPass(PassRegistry&);
+void initializeRABasicPass(PassRegistry&);
void initializeRAGreedyPass(PassRegistry&);
void initializeReassociateLegacyPassPass(PassRegistry&);
void initializeRegBankSelectPass(PassRegistry&);
@@ -327,8 +328,9 @@ void initializeSafeStackLegacyPassPass(PassRegistry&);
void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&);
void initializeSanitizerCoverageModulePass(PassRegistry&);
void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
-void initializeScalarizerPass(PassRegistry&);
void initializeScalarizeMaskedMemIntrinPass(PassRegistry&);
+void initializeScalarizerPass(PassRegistry&);
+void initializeScavengerTestPass(PassRegistry&);
void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
void initializeShadowStackGCLoweringPass(PassRegistry&);
diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index 5ba8492db8f5..73106f77ca55 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -46,6 +46,9 @@ struct Config {
unsigned OptLevel = 2;
bool DisableVerify = false;
+ /// Use the new pass manager
+ bool UseNewPM = false;
+
/// Disable entirely the optimizer, including importing for ThinLTO
bool CodeGenOnly = false;
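A short usage sketch for the new flag; the field name comes from the hunk above, while the surrounding LTO setup is elided and assumed.

    #include "llvm/LTO/Config.h"

    llvm::lto::Config makeConfig() {
      llvm::lto::Config Conf;
      Conf.OptLevel = 2;    // existing default, shown for context
      Conf.UseNewPM = true; // opt this link into the new pass manager
      return Conf;
    }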
diff --git a/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
index 6ddae2e2b41c..a6d4d404415f 100644
--- a/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
+++ b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
@@ -17,12 +17,20 @@
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
#include "llvm/ObjectYAML/YAML.h"
namespace llvm {
+
+namespace codeview {
+class DebugStringTableSubsection;
+class DebugStringTableSubsectionRef;
+class DebugChecksumsSubsectionRef;
+}
namespace CodeViewYAML {
+
namespace detail {
-struct C13FragmentBase;
+struct YAMLSubsectionBase;
}
struct SourceLineEntry {
@@ -74,18 +82,24 @@ struct InlineeInfo {
std::vector<InlineeSite> Sites;
};
-struct SourceFileInfo {
- std::vector<SourceFileChecksumEntry> FileChecksums;
- std::vector<SourceLineInfo> LineFragments;
- std::vector<InlineeInfo> Inlinees;
-};
+struct YAMLDebugSubsection {
+ static Expected<YAMLDebugSubsection>
+ fromCodeViewSubection(const codeview::DebugStringTableSubsectionRef &Strings,
+ const codeview::DebugChecksumsSubsectionRef &Checksums,
+ const codeview::DebugSubsectionRecord &SS);
-struct C13DebugSection {
- std::vector<detail::C13FragmentBase> Fragments;
+ std::shared_ptr<detail::YAMLSubsectionBase> Subsection;
};
+
+Expected<std::vector<std::unique_ptr<codeview::DebugSubsection>>>
+convertSubsectionList(ArrayRef<YAMLDebugSubsection> Subsections,
+ codeview::DebugStringTableSubsection &Strings);
+
} // namespace CodeViewYAML
} // namespace llvm
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileInfo)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::YAMLDebugSubsection)
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::YAMLDebugSubsection)
#endif
diff --git a/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
index ee4e2ac9d404..9b411e8b074f 100644
--- a/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
+++ b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
@@ -28,7 +28,9 @@ struct SymbolRecordBase;
struct SymbolRecord {
std::shared_ptr<detail::SymbolRecordBase> Symbol;
- codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator) const;
+ codeview::CVSymbol
+ toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ codeview::CodeViewContainer Container) const;
static Expected<SymbolRecord> fromCodeViewSymbol(codeview::CVSymbol Symbol);
};
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 3c181f0e511b..5c3bf88fbbfa 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -1361,10 +1361,6 @@ public:
return false;
}
- bool isTemplateArg(StringRef Name) const {
- return isTemplateArg(StringInit::get(Name));
- }
-
const RecordVal *getValue(const Init *Name) const {
for (const RecordVal &Val : Values)
if (Val.Name == Name) return &Val;
@@ -1388,10 +1384,6 @@ public:
TemplateArgs.push_back(Name);
}
- void addTemplateArg(StringRef Name) {
- addTemplateArg(StringInit::get(Name));
- }
-
void addValue(const RecordVal &RV) {
assert(getValue(RV.getNameInit()) == nullptr && "Value already added!");
Values.push_back(RV);
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index d66b6edc7a4f..de35cdf052e1 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -81,15 +81,11 @@ public:
/// \p ExportLists contains for each Module the set of globals (GUID) that will
/// be imported by another module, or referenced by such a function. I.e. this
/// is the set of globals that need to be promoted/renamed appropriately.
-///
-/// \p DeadSymbols (optional) contains a list of GUID that are deemed "dead" and
-/// will be ignored for the purpose of importing.
void ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists,
- const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr);
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists);
/// Compute all the imports for the given module using the Index.
///
@@ -102,9 +98,9 @@ void ComputeCrossModuleImportForModule(
/// Compute all the symbols that are "dead": i.e. those that can't be reached
/// in the graph from any of the given symbols listed in
/// \p GUIDPreservedSymbols.
-DenseSet<GlobalValue::GUID>
-computeDeadSymbols(const ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols);
+void computeDeadSymbols(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols);
/// Compute the set of summaries needed for a ThinLTO backend compilation of
/// \p ModulePath.
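A hedged sketch of the new calling convention: computeDeadSymbols now mutates the index in place (setting the per-summary Live bits and, per the ModuleSummaryIndex hunk earlier in this patch, the index-wide dead-stripping flag) instead of returning a set of dead GUIDs for callers to thread through the import computation.

    #include "llvm/ADT/DenseSet.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Transforms/IPO/FunctionImport.h"

    void stripDead(llvm::ModuleSummaryIndex &Index,
                   const llvm::DenseSet<llvm::GlobalValue::GUID> &Preserved) {
      // No returned DenseSet to pass into ComputeCrossModuleImport anymore.
      llvm::computeDeadSymbols(Index, Preserved);
    }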
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 023d7af7f729..b6c6c091631d 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -177,6 +177,7 @@ struct SanitizerCoverageOptions {
bool Use8bitCounters = false;
bool TracePC = false;
bool TracePCGuard = false;
+ bool Inline8bitCounters = false;
bool NoPrune = false;
SanitizerCoverageOptions() = default;
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index 91c9d255302f..2a8b89d86282 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -36,6 +36,7 @@ class BasicBlock;
class BlockFrequencyInfo;
class CallInst;
class CallGraph;
+class DebugInfoFinder;
class DominatorTree;
class Function;
class Instruction;
@@ -110,7 +111,8 @@ struct ClonedCodeInfo {
///
BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
const Twine &NameSuffix = "", Function *F = nullptr,
- ClonedCodeInfo *CodeInfo = nullptr);
+ ClonedCodeInfo *CodeInfo = nullptr,
+ DebugInfoFinder *DIFinder = nullptr);
/// CloneFunction - Return a copy of the specified function and add it to that
/// function's module. Also, any references specified in the VMap are changed
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 6a1af87450c9..a906770dbb34 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1170,7 +1170,9 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
// fold: icmp (inttoptr x), null -> icmp x, 0
+ // fold: icmp null, (inttoptr x) -> icmp 0, x
// fold: icmp (ptrtoint x), 0 -> icmp x, null
+ // fold: icmp 0, (ptrtoint x) -> icmp null, x
// fold: icmp (inttoptr x), (inttoptr y) -> icmp trunc/zext x, trunc/zext y
// fold: icmp (ptrtoint x), (ptrtoint y) -> icmp x, y
//
@@ -1240,6 +1242,11 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL);
}
+ } else if (isa<ConstantExpr>(Ops1)) {
+ // If RHS is a constant expression, but the left side isn't, swap the
+ // operands and try again.
+ Predicate = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)Predicate);
+ return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI);
}
return ConstantExpr::getCompare(Predicate, Ops0, Ops1);
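The new else-if canonicalizes by commuting the comparison so the existing LHS-side folds apply: for example, icmp slt 0, (ptrtoint x) becomes icmp sgt (ptrtoint x), 0. A small sketch of the predicate swap it relies on (standalone, not part of the patch):

    #include "llvm/IR/Instructions.h"

    // The swapped predicate preserves the comparison's meaning when the
    // operands trade places (SLT <-> SGT, ULE <-> UGE; EQ/NE are unchanged).
    llvm::CmpInst::Predicate commute(llvm::CmpInst::Predicate P) {
      return llvm::ICmpInst::getSwappedPredicate(P);
    }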
diff --git a/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index 3da33ac71421..ed233d201537 100644
--- a/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -43,7 +43,7 @@ static cl::opt<unsigned>
// The percent threshold for the direct-call target (this call site vs the
// total call count) for it to be considered as the promotion target.
static cl::opt<unsigned>
- ICPPercentThreshold("icp-percent-threshold", cl::init(33), cl::Hidden,
+ ICPPercentThreshold("icp-percent-threshold", cl::init(30), cl::Hidden,
cl::ZeroOrMore,
cl::desc("The percentage threshold for the promotion"));
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 4702569126c6..77c87928728a 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -54,11 +54,6 @@ static cl::opt<int>
cl::init(45),
cl::desc("Threshold for inlining cold callsites"));
-static cl::opt<bool>
- EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
- cl::init(false),
- cl::desc("Enable generic switch cost model"));
-
// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
@@ -1015,83 +1010,68 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
if (isa<ConstantInt>(V))
return true;
- if (EnableGenericSwitchCost) {
- // Assume the most general case where the swith is lowered into
- // either a jump table, bit test, or a balanced binary tree consisting of
- // case clusters without merging adjacent clusters with the same
- // destination. We do not consider the switches that are lowered with a mix
- // of jump table/bit test/binary search tree. The cost of the switch is
- // proportional to the size of the tree or the size of jump table range.
-
- // Exit early for a large switch, assuming one case needs at least one
- // instruction.
- // FIXME: This is not true for a bit test, but ignore such case for now to
- // save compile-time.
- int64_t CostLowerBound =
- std::min((int64_t)INT_MAX,
- (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
-
- if (CostLowerBound > Threshold) {
- Cost = CostLowerBound;
- return false;
- }
+  // Assume the most general case where the switch is lowered into
+ // either a jump table, bit test, or a balanced binary tree consisting of
+ // case clusters without merging adjacent clusters with the same
+ // destination. We do not consider the switches that are lowered with a mix
+ // of jump table/bit test/binary search tree. The cost of the switch is
+ // proportional to the size of the tree or the size of jump table range.
+ //
+ // NB: We convert large switches which are just used to initialize large phi
+ // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
+ // inlining those. It will prevent inlining in cases where the optimization
+ // does not (yet) fire.
- unsigned JumpTableSize = 0;
- unsigned NumCaseCluster =
- TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+ // Exit early for a large switch, assuming one case needs at least one
+ // instruction.
+ // FIXME: This is not true for a bit test, but ignore such case for now to
+ // save compile-time.
+ int64_t CostLowerBound =
+ std::min((int64_t)INT_MAX,
+ (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
- // If suitable for a jump table, consider the cost for the table size and
- // branch to destination.
- if (JumpTableSize) {
- int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
- 4 * InlineConstants::InstrCost;
- Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
- return false;
- }
+ if (CostLowerBound > Threshold) {
+ Cost = CostLowerBound;
+ return false;
+ }
- // Considering forming a binary search, we should find the number of nodes
- // which is same as the number of comparisons when lowered. For a given
- // number of clusters, n, we can define a recursive function, f(n), to find
- // the number of nodes in the tree. The recursion is :
- // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
- // and f(n) = n, when n <= 3.
- // This will lead a binary tree where the leaf should be either f(2) or f(3)
- // when n > 3. So, the number of comparisons from leaves should be n, while
- // the number of non-leaf should be :
- // 2^(log2(n) - 1) - 1
- // = 2^log2(n) * 2^-1 - 1
- // = n / 2 - 1.
- // Considering comparisons from leaf and non-leaf nodes, we can estimate the
- // number of comparisons in a simple closed form :
- // n + n / 2 - 1 = n * 3 / 2 - 1
- if (NumCaseCluster <= 3) {
- // Suppose a comparison includes one compare and one conditional branch.
- Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
- return false;
- }
- int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
- uint64_t SwitchCost =
- ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
- Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
+ unsigned JumpTableSize = 0;
+ unsigned NumCaseCluster =
+ TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+ Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
return false;
}
- // Use a simple switch cost model where we accumulate a cost proportional to
- // the number of distinct successor blocks. This fan-out in the CFG cannot
- // be represented for free even if we can represent the core switch as a
- // jumptable that takes a single instruction.
- ///
- // NB: We convert large switches which are just used to initialize large phi
- // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
- // inlining those. It will prevent inlining in cases where the optimization
- // does not (yet) fire.
- SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
- SuccessorBlocks.insert(SI.getDefaultDest());
- for (auto Case : SI.cases())
- SuccessorBlocks.insert(Case.getCaseSuccessor());
- // Add cost corresponding to the number of distinct destinations. The first
- // we model as free because of fallthrough.
- Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+ // Considering forming a binary search, we should find the number of nodes
+ // which is same as the number of comparisons when lowered. For a given
+ // number of clusters, n, we can define a recursive function, f(n), to find
+ // the number of nodes in the tree. The recursion is :
+  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
+  //   and f(n) = n, when n <= 3.
+  // This will lead to a binary tree where each leaf should be either f(2) or
+  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
+  // while the number of non-leaf nodes should be:
+ // 2^(log2(n) - 1) - 1
+ // = 2^log2(n) * 2^-1 - 1
+ // = n / 2 - 1.
+ // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+ // number of comparisons in a simple closed form :
+ // n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+ return false;
+ }
+ int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
+ uint64_t SwitchCost =
+ ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+ Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
return false;
}
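A worked instance of the closed form above, assuming InlineConstants::InstrCost is 5 (its value at the time of this patch): for n = 8 case clusters the model predicts 8 + 8/2 - 1 = 11 comparisons, hence a switch cost of 110.

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t InstrCost = 5;        // assumed InlineConstants::InstrCost
      const uint64_t NumCaseCluster = 8;  // n > 3, so the closed form applies
      int64_t ExpectedNumberOfCompare = 3 * NumCaseCluster / 2 - 1; // 11
      // one compare plus one conditional branch per comparison
      int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost; // 110
      std::cout << SwitchCost << "\n";
      return 0;
    }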
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index a2b9015a8a1d..6a9ae6440ace 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -662,13 +662,13 @@ namespace {
bool solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB);
bool solveBlockValueSelect(LVILatticeVal &BBLV, SelectInst *S,
BasicBlock *BB);
- bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, Instruction *BBI,
+ bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, BinaryOperator *BBI,
BasicBlock *BB);
- bool solveBlockValueCast(LVILatticeVal &BBLV, Instruction *BBI,
+ bool solveBlockValueCast(LVILatticeVal &BBLV, CastInst *CI,
BasicBlock *BB);
void intersectAssumeOrGuardBlockValueConstantRange(Value *Val,
LVILatticeVal &BBLV,
- Instruction *BBI);
+ Instruction *BBI);
void solve();
@@ -849,12 +849,12 @@ bool LazyValueInfoImpl::solveBlockValueImpl(LVILatticeVal &Res,
return true;
}
if (BBI->getType()->isIntegerTy()) {
- if (isa<CastInst>(BBI))
- return solveBlockValueCast(Res, BBI, BB);
-
+ if (auto *CI = dyn_cast<CastInst>(BBI))
+ return solveBlockValueCast(Res, CI, BB);
+
BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
if (BO && isa<ConstantInt>(BO->getOperand(1)))
- return solveBlockValueBinaryOp(Res, BBI, BB);
+ return solveBlockValueBinaryOp(Res, BO, BB);
}
DEBUG(dbgs() << " compute BB '" << BB->getName()
@@ -1168,9 +1168,9 @@ bool LazyValueInfoImpl::solveBlockValueSelect(LVILatticeVal &BBLV,
}
bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
- Instruction *BBI,
- BasicBlock *BB) {
- if (!BBI->getOperand(0)->getType()->isSized()) {
+ CastInst *CI,
+ BasicBlock *BB) {
+ if (!CI->getOperand(0)->getType()->isSized()) {
// Without knowing how wide the input is, we can't analyze it in any useful
// way.
BBLV = LVILatticeVal::getOverdefined();
@@ -1180,7 +1180,7 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
// Filter out casts we don't know how to reason about before attempting to
// recurse on our operand. This can cut a long search short if we know we're
// not going to be able to get any useful information anyways.
- switch (BBI->getOpcode()) {
+ switch (CI->getOpcode()) {
case Instruction::Trunc:
case Instruction::SExt:
case Instruction::ZExt:
@@ -1197,44 +1197,43 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
// Figure out the range of the LHS. If that fails, we still apply the
// transfer rule on the full set since we may be able to locally infer
// interesting facts.
- if (!hasBlockValue(BBI->getOperand(0), BB))
- if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ if (!hasBlockValue(CI->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, CI->getOperand(0))))
// More work to do before applying this transfer rule.
return false;
const unsigned OperandBitWidth =
- DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ DL.getTypeSizeInBits(CI->getOperand(0)->getType());
ConstantRange LHSRange = ConstantRange(OperandBitWidth);
- if (hasBlockValue(BBI->getOperand(0), BB)) {
- LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
- intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal,
- BBI);
+ if (hasBlockValue(CI->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(CI->getOperand(0), BB);
+ intersectAssumeOrGuardBlockValueConstantRange(CI->getOperand(0), LHSVal,
+ CI);
if (LHSVal.isConstantRange())
LHSRange = LHSVal.getConstantRange();
}
- const unsigned ResultBitWidth =
- cast<IntegerType>(BBI->getType())->getBitWidth();
+ const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth();
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allow us to analyze
// more definitions.
- auto CastOp = (Instruction::CastOps) BBI->getOpcode();
- BBLV = LVILatticeVal::getRange(LHSRange.castOp(CastOp, ResultBitWidth));
+ BBLV = LVILatticeVal::getRange(LHSRange.castOp(CI->getOpcode(),
+ ResultBitWidth));
return true;
}
bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV,
- Instruction *BBI,
+ BinaryOperator *BO,
BasicBlock *BB) {
- assert(BBI->getOperand(0)->getType()->isSized() &&
+ assert(BO->getOperand(0)->getType()->isSized() &&
"all operands to binary operators are sized");
// Filter out operators we don't know how to reason about before attempting to
// recurse on our operand(s). This can cut a long search short if we know
- // we're not going to be able to get any useful information anways.
- switch (BBI->getOpcode()) {
+ // we're not going to be able to get any useful information anyways.
+ switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -1256,29 +1255,29 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV,
// Figure out the range of the LHS. If that fails, use a conservative range,
// but apply the transfer rule anyways. This lets us pick up facts from
// expressions like "and i32 (call i32 @foo()), 32"
- if (!hasBlockValue(BBI->getOperand(0), BB))
- if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ if (!hasBlockValue(BO->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, BO->getOperand(0))))
// More work to do before applying this transfer rule.
return false;
const unsigned OperandBitWidth =
- DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ DL.getTypeSizeInBits(BO->getOperand(0)->getType());
ConstantRange LHSRange = ConstantRange(OperandBitWidth);
- if (hasBlockValue(BBI->getOperand(0), BB)) {
- LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
- intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal,
- BBI);
+ if (hasBlockValue(BO->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(BO->getOperand(0), BB);
+ intersectAssumeOrGuardBlockValueConstantRange(BO->getOperand(0), LHSVal,
+ BO);
if (LHSVal.isConstantRange())
LHSRange = LHSVal.getConstantRange();
}
- ConstantInt *RHS = cast<ConstantInt>(BBI->getOperand(1));
+ ConstantInt *RHS = cast<ConstantInt>(BO->getOperand(1));
ConstantRange RHSRange = ConstantRange(RHS->getValue());
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allow us to analyze
// more definitions.
- auto BinOp = (Instruction::BinaryOps) BBI->getOpcode();
+ Instruction::BinaryOps BinOp = BO->getOpcode();
BBLV = LVILatticeVal::getRange(LHSRange.binaryOp(BinOp, RHSRange));
return true;
}
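To make the transfer rule concrete, a small sketch of the ConstantRange::castOp call the cast path now reaches (the values are illustrative): zero-extending the i8 range [100, 200) to 32 bits yields the i32 range [100, 200).

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"

    llvm::ConstantRange zextExample() {
      llvm::APInt Lo(8, 100), Hi(8, 200);
      llvm::ConstantRange LHSRange(Lo, Hi);          // i8 values [100, 200)
      return LHSRange.castOp(llvm::Instruction::ZExt,
                             /*ResultBitWidth=*/32); // i32 [100, 200)
    }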
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 26706f5509ba..3253f27c010d 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -275,7 +275,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// FIXME: refactor this to use the same code that inliner is using.
F.isVarArg();
GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto FuncSummary = llvm::make_unique<FunctionSummary>(
Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(),
TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(),
@@ -295,7 +295,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
findRefEdges(Index, &V, RefEdges, Visited);
bool NonRenamableLocal = isNonRenamableLocal(V);
GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto GVarSummary =
llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
if (NonRenamableLocal)
@@ -308,7 +308,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
DenseSet<GlobalValue::GUID> &CantBePromoted) {
bool NonRenamableLocal = isNonRenamableLocal(A);
GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto AS = llvm::make_unique<AliasSummary>(Flags, ArrayRef<ValueInfo>{});
auto *Aliasee = A.getBaseObject();
auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee);
@@ -323,7 +323,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) {
if (ValueInfo VI = Index.getValueInfo(GlobalValue::getGUID(Name)))
for (auto &Summary : VI.getSummaryList())
- Summary->setLiveRoot();
+ Summary->setLive(true);
}
ModuleSummaryIndex llvm::buildModuleSummaryIndex(
@@ -423,8 +423,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
return;
assert(GV->isDeclaration() && "Def in module asm already has definition");
GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage,
- /* NotEligibleToImport */ true,
- /* LiveRoot */ true);
+ /* NotEligibleToImport = */ true,
+ /* Live = */ true);
CantBePromoted.insert(GlobalValue::getGUID(Name));
// Create the appropriate summary type.
if (isa<Function>(GV)) {
diff --git a/lib/Analysis/OrderedBasicBlock.cpp b/lib/Analysis/OrderedBasicBlock.cpp
index 0f0016f22cc0..a04c0aef04be 100644
--- a/lib/Analysis/OrderedBasicBlock.cpp
+++ b/lib/Analysis/OrderedBasicBlock.cpp
@@ -55,7 +55,7 @@ bool OrderedBasicBlock::comesBefore(const Instruction *A,
assert(II != IE && "Instruction not found?");
assert((Inst == A || Inst == B) && "Should find A or B");
LastInstFound = II;
- return Inst == A;
+ return Inst != B;
}
/// \brief Find out whether \p A dominates \p B, meaning whether \p A
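The one-token fix above changes the A == B case: the scan finds the shared instruction first, and `Inst != B` now yields false, making comesBefore irreflexive (the old `Inst == A` answered true). A hypothetical standalone model of the scan, not the LLVM class itself:

    #include <vector>

    // Hypothetical model: instructions as ints, a basic block as a vector.
    bool comesBefore(const std::vector<int> &Block, int A, int B) {
      for (int Inst : Block)
        if (Inst == A || Inst == B)
          return Inst != B; // old `Inst == A` returned true when A == B
      return false; // not found; the real code asserts instead
    }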
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 82107cb18025..b38e6225c840 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/RegionPass.h"
#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/IR/OptBisect.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
@@ -280,3 +281,18 @@ Pass *RegionPass::createPrinterPass(raw_ostream &O,
const std::string &Banner) const {
return new PrintRegionPass(Banner, O);
}
+
+bool RegionPass::skipRegion(Region &R) const {
+ Function &F = *R.getEntry()->getParent();
+ if (!F.getContext().getOptBisect().shouldRunPass(this, R))
+ return true;
+
+ if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ // Report this only once per function.
+ if (R.getEntry() == &F.getEntryBlock())
+ DEBUG(dbgs() << "Skipping pass '" << getPassName()
+ << "' on function " << F.getName() << "\n");
+ return true;
+ }
+ return false;
+}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 686c94687669..fffa9045b2fd 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -865,11 +865,11 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits
RawFlags = RawFlags >> 4;
bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3;
- // The LiveRoot flag wasn't introduced until version 3. For dead stripping
+ // The Live flag wasn't introduced until version 3. For dead stripping
// to work correctly on earlier versions, we must conservatively treat all
// values as live.
- bool LiveRoot = (RawFlags & 0x2) || Version < 3;
- return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot);
+ bool Live = (RawFlags & 0x2) || Version < 3;
+ return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live);
}
static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index a402b4ddd462..9043b8c12d25 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -351,7 +351,8 @@ public:
/// Calls the callback for each value GUID and summary to be written to
/// bitcode. This hides the details of whether they are being pulled from the
/// entire index or just those in a provided ModuleToSummariesForIndex map.
- void forEachSummary(std::function<void(GVInfo)> Callback) {
+ template<typename Functor>
+ void forEachSummary(Functor Callback) {
if (ModuleToSummariesForIndex) {
for (auto &M : *ModuleToSummariesForIndex)
for (auto &Summary : M.second)
@@ -363,6 +364,29 @@ public:
}
}
+ /// Calls the callback for each entry in the modulePaths StringMap that
+ /// should be written to the module path string table. This hides the details
+ /// of whether they are being pulled from the entire index or just those in a
+ /// provided ModuleToSummariesForIndex map.
+ template <typename Functor> void forEachModule(Functor Callback) {
+ if (ModuleToSummariesForIndex) {
+ for (const auto &M : *ModuleToSummariesForIndex) {
+ const auto &MPI = Index.modulePaths().find(M.first);
+ if (MPI == Index.modulePaths().end()) {
+        // This should only happen if the bitcode file was empty, in which
+        // case we shouldn't be importing (the ModuleToSummariesForIndex
+        // would only include the module we are writing the index for).
+ assert(ModuleToSummariesForIndex->size() == 1);
+ continue;
+ }
+ Callback(*MPI);
+ }
+ } else {
+ for (const auto &MPSE : Index.modulePaths())
+ Callback(MPSE);
+ }
+ }
+
/// Main entry point for writing a combined index to bitcode.
void write();
@@ -370,14 +394,6 @@ private:
void writeModStrings();
void writeCombinedGlobalValueSummary();
- /// Indicates whether the provided \p ModulePath should be written into
- /// the module string table, e.g. if full index written or if it is in
- /// the provided subset.
- bool doIncludeModule(StringRef ModulePath) {
- return !ModuleToSummariesForIndex ||
- ModuleToSummariesForIndex->count(ModulePath);
- }
-
Optional<unsigned> getValueId(GlobalValue::GUID ValGUID) {
auto VMI = GUIDToValueIdMap.find(ValGUID);
if (VMI == GUIDToValueIdMap.end())
@@ -864,7 +880,7 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
uint64_t RawFlags = 0;
RawFlags |= Flags.NotEligibleToImport; // bool
- RawFlags |= (Flags.LiveRoot << 1);
+ RawFlags |= (Flags.Live << 1);
// Linkage doesn't need to be remapped at that time for the summary. Any future
// change to the getEncodedLinkage() function will need to be taken into
// account here as well.
@@ -968,19 +984,18 @@ void ModuleBitcodeWriter::writeValueSymbolTableForwardDecl() {
enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 };
/// Determine the encoding to use for the given string name and length.
-static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) {
+static StringEncoding getStringEncoding(StringRef Str) {
bool isChar6 = true;
- for (const char *C = Str, *E = C + StrLen; C != E; ++C) {
+ for (char C : Str) {
if (isChar6)
- isChar6 = BitCodeAbbrevOp::isChar6(*C);
- if ((unsigned char)*C & 128)
+ isChar6 = BitCodeAbbrevOp::isChar6(C);
+ if ((unsigned char)C & 128)
// don't bother scanning the rest.
return SE_Fixed8;
}
if (isChar6)
return SE_Char6;
- else
- return SE_Fixed7;
+ return SE_Fixed7;
}
/// Emit top-level description of module, including target triple, inline asm,
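A standalone restatement of the simplified helper (an assumption: it mirrors the StringRef-based version above, with the Char6 alphabet of BitCodeAbbrevOp::isChar6 inlined): Fixed8 as soon as any byte has the high bit set, Char6 if every byte is in [a-zA-Z0-9._], otherwise Fixed7.

    #include <string>

    enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 };

    static bool isChar6(char C) { // the BitCodeAbbrevOp::isChar6 alphabet
      return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
             (C >= '0' && C <= '9') || C == '.' || C == '_';
    }

    StringEncoding getStringEncoding(const std::string &Str) {
      bool Char6 = true;
      for (char C : Str) {
        if (Char6)
          Char6 = isChar6(C);
        if (static_cast<unsigned char>(C) & 128)
          return SE_Fixed8; // high bit set: no need to scan further
      }
      return Char6 ? SE_Char6 : SE_Fixed7;
    }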
@@ -1073,8 +1088,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
SmallVector<unsigned, 64> Vals;
// Emit the module's source file name.
{
- StringEncoding Bits = getStringEncoding(M.getSourceFileName().data(),
- M.getSourceFileName().size());
+ StringEncoding Bits = getStringEncoding(M.getSourceFileName());
BitCodeAbbrevOp AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8);
if (Bits == SE_Char6)
AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Char6);
@@ -2790,8 +2804,7 @@ void ModuleBitcodeWriter::writeFunctionLevelValueSymbolTable(
for (const ValueName &Name : VST) {
// Figure out the encoding to use for the name.
- StringEncoding Bits =
- getStringEncoding(Name.getKeyData(), Name.getKeyLength());
+ StringEncoding Bits = getStringEncoding(Name.getKey());
unsigned AbbrevToUse = VST_ENTRY_8_ABBREV;
NameVals.push_back(VE.getValueID(Name.getValue()));
@@ -3149,41 +3162,33 @@ void IndexBitcodeWriter::writeModStrings() {
unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv));
SmallVector<unsigned, 64> Vals;
- for (const auto &MPSE : Index.modulePaths()) {
- if (!doIncludeModule(MPSE.getKey()))
- continue;
- StringEncoding Bits =
- getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size());
- unsigned AbbrevToUse = Abbrev8Bit;
- if (Bits == SE_Char6)
- AbbrevToUse = Abbrev6Bit;
- else if (Bits == SE_Fixed7)
- AbbrevToUse = Abbrev7Bit;
-
- Vals.push_back(MPSE.getValue().first);
-
- for (const auto P : MPSE.getKey())
- Vals.push_back((unsigned char)P);
-
- // Emit the finished record.
- Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse);
-
- Vals.clear();
- // Emit an optional hash for the module now
- auto &Hash = MPSE.getValue().second;
- bool AllZero = true; // Detect if the hash is empty, and do not generate it
- for (auto Val : Hash) {
- if (Val)
- AllZero = false;
- Vals.push_back(Val);
- }
- if (!AllZero) {
- // Emit the hash record.
- Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash);
- }
+ forEachModule(
+ [&](const StringMapEntry<std::pair<uint64_t, ModuleHash>> &MPSE) {
+ StringRef Key = MPSE.getKey();
+ const auto &Value = MPSE.getValue();
+ StringEncoding Bits = getStringEncoding(Key);
+ unsigned AbbrevToUse = Abbrev8Bit;
+ if (Bits == SE_Char6)
+ AbbrevToUse = Abbrev6Bit;
+ else if (Bits == SE_Fixed7)
+ AbbrevToUse = Abbrev7Bit;
+
+ Vals.push_back(Value.first);
+ Vals.append(Key.begin(), Key.end());
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse);
+
+ // Emit an optional hash for the module now
+ const auto &Hash = Value.second;
+ if (llvm::any_of(Hash, [](uint32_t H) { return H; })) {
+ Vals.assign(Hash.begin(), Hash.end());
+ // Emit the hash record.
+ Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash);
+ }
- Vals.clear();
- }
+ Vals.clear();
+ });
Stream.ExitBlock();
}
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 20e1467b30c3..c2ad9db81cfd 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -194,6 +194,10 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
// some variables.
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ // Ignore call instructions that claim to clobber SP. The AArch64
+ // backend does this for aggregate function arguments.
+ if (MI.isCall() && MO.getReg() == SP)
+ continue;
// If this is a virtual register, only clobber it since it doesn't
// have aliases.
if (TRI->isVirtualRegister(MO.getReg()))
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 4d30c6574b12..256a0c95d365 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -77,6 +77,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializePostRASchedulerPass(Registry);
initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
initializeProcessImplicitDefsPass(Registry);
+ initializeRABasicPass(Registry);
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 23812a2a2344..3603f9b7ed93 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -556,6 +556,10 @@ bool GlobalMerge::doInitialization(Module &M) {
if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection())
continue;
+ // It's not safe to merge globals that may be preempted
+ if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
+ continue;
+
if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
!GV.hasInternalLinkage())
continue;
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index 0dc1079b2ad4..cde6ccd29dfd 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -198,13 +198,12 @@ void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
}
void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
if (!MBB.succ_empty()) {
- const MachineFunction &MF = *MBB.getParent();
addPristines(*this, MF);
addLiveOutsNoPristines(MBB);
} else if (MBB.isReturnBlock()) {
// For the return block: Add all callee saved registers.
- const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.isCalleeSavedInfoValid())
addCalleeSavedRegs(*this, MF);
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
index dff555f49565..3746b74e0528 100644
--- a/lib/CodeGen/LiveRegUnits.cpp
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -12,11 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveRegUnits.h"
+
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -81,46 +83,50 @@ void LiveRegUnits::accumulateBackward(const MachineInstr &MI) {
}
/// Add live-in registers of basic block \p MBB to \p LiveUnits.
-static void addLiveIns(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
+static void addBlockLiveIns(LiveRegUnits &LiveUnits,
+ const MachineBasicBlock &MBB) {
for (const auto &LI : MBB.liveins())
LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask);
}
-static void addLiveOuts(LiveRegUnits &LiveUnits, const MachineBasicBlock &MBB) {
- // To get the live-outs we simply merge the live-ins of all successors.
- for (const MachineBasicBlock *Succ : MBB.successors())
- addLiveIns(LiveUnits, *Succ);
+/// Adds all callee saved registers to \p LiveUnits.
+static void addCalleeSavedRegs(LiveRegUnits &LiveUnits,
+ const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
+ LiveUnits.addReg(*CSR);
}
-/// Add pristine registers to the given \p LiveUnits. This function removes
-/// actually saved callee save registers when \p InPrologueEpilogue is false.
-static void removeSavedRegs(LiveRegUnits &LiveUnits, const MachineFunction &MF,
- const MachineFrameInfo &MFI,
- const TargetRegisterInfo &TRI) {
+/// Adds pristine registers to the given \p LiveUnits. Pristine registers are
+/// callee saved registers that are unused in the function.
+static void addPristines(LiveRegUnits &LiveUnits, const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+  // Add all callee saved regs, then remove the ones the function actually
+  // saves and restores.
+  addCalleeSavedRegs(LiveUnits, MF);
+  // Registers in the CalleeSavedInfo are saved and restored, hence not
+  // pristine; whatever remains after removing them is pristine.
for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
LiveUnits.removeReg(Info.getReg());
}
void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.isCalleeSavedInfoValid()) {
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
- addReg(*I);
- if (!MBB.isReturnBlock())
- removeSavedRegs(*this, MF, MFI, *TRI);
+ if (!MBB.succ_empty()) {
+ addPristines(*this, MF);
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*this, *Succ);
+ } else if (MBB.isReturnBlock()) {
+ // For the return block: Add all callee saved registers.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addCalleeSavedRegs(*this, MF);
}
- ::addLiveOuts(*this, MBB);
}
void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.isCalleeSavedInfoValid()) {
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
- addReg(*I);
- if (&MBB != &MF.front())
- removeSavedRegs(*this, MF, MFI, *TRI);
- }
- ::addLiveIns(*this, MBB);
+ addPristines(*this, MF);
+ addBlockLiveIns(*this, MBB);
}
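A hypothetical model (plain std::set instead of register-unit bit vectors) of the pristine-register computation addPristines performs above: start from every callee-saved register and drop the ones the function actually saves and restores.

    #include <set>

    std::set<int> pristines(const std::set<int> &CalleeSaved,
                            const std::set<int> &SavedInFunction) {
      std::set<int> P = CalleeSaved;   // all CSRs (addCalleeSavedRegs)
      for (int R : SavedInFunction)
        P.erase(R);                    // saved+restored regs are not pristine
      return P;
    }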
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index 71ad4e6aa7f5..2402ffdbbcb1 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,7 +1,19 @@
-#include "llvm/CodeGen/MachineRegionInfo.h"
+//===- lib/CodeGen/MachineRegionInfo.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "machine-region-info"
@@ -11,36 +23,29 @@ STATISTIC(numMachineRegions, "The # of machine regions");
STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
namespace llvm {
+
template class RegionBase<RegionTraits<MachineFunction>>;
template class RegionNodeBase<RegionTraits<MachineFunction>>;
template class RegionInfoBase<RegionTraits<MachineFunction>>;
-}
+
+} // end namespace llvm
//===----------------------------------------------------------------------===//
// MachineRegion implementation
-//
MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
MachineRegionInfo* RI,
MachineDominatorTree *DT, MachineRegion *Parent) :
- RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {
+ RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {}
-}
-
-MachineRegion::~MachineRegion() { }
+MachineRegion::~MachineRegion() = default;
//===----------------------------------------------------------------------===//
// MachineRegionInfo implementation
-//
-
-MachineRegionInfo::MachineRegionInfo() :
- RegionInfoBase<RegionTraits<MachineFunction>>() {
-
-}
-MachineRegionInfo::~MachineRegionInfo() {
+MachineRegionInfo::MachineRegionInfo() = default;
-}
+MachineRegionInfo::~MachineRegionInfo() = default;
void MachineRegionInfo::updateStatistics(MachineRegion *R) {
++numMachineRegions;
@@ -73,9 +78,7 @@ MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
}
-MachineRegionInfoPass::~MachineRegionInfoPass() {
-
-}
+MachineRegionInfoPass::~MachineRegionInfoPass() = default;
bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
releaseMemory();
@@ -137,8 +140,9 @@ INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
// the link time optimization.
namespace llvm {
- FunctionPass *createMachineRegionInfoPass() {
- return new MachineRegionInfoPass();
- }
+
+FunctionPass *createMachineRegionInfoPass() {
+ return new MachineRegionInfoPass();
}
+} // end namespace llvm
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 265f93c363ca..f6dbf667cf02 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -36,6 +36,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
@@ -909,17 +910,43 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
}
- // Generic loads and stores must have a single MachineMemOperand
- // describing that access.
- if ((MI->getOpcode() == TargetOpcode::G_LOAD ||
- MI->getOpcode() == TargetOpcode::G_STORE) &&
- !MI->hasOneMemOperand())
- report("Generic instruction accessing memory must have one mem operand",
- MI);
-
StringRef ErrorInfo;
if (!TII->verifyInstruction(*MI, ErrorInfo))
report(ErrorInfo.data(), MI);
+
+ // Verify properties of various specific instruction types
+  switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ // Generic loads and stores must have a single MachineMemOperand
+ // describing that access.
+ if (!MI->hasOneMemOperand())
+ report("Generic instruction accessing memory must have one mem operand",
+ MI);
+ break;
+  case TargetOpcode::STATEPOINT: {
+    if (!MI->getOperand(StatepointOpers::IDPos).isImm() ||
+        !MI->getOperand(StatepointOpers::NBytesPos).isImm() ||
+        !MI->getOperand(StatepointOpers::NCallArgsPos).isImm())
+      report("meta operands to STATEPOINT not constant!", MI);
+
+    auto VerifyStackMapConstant = [&](unsigned Offset) {
+      if (!MI->getOperand(Offset).isImm() ||
+          MI->getOperand(Offset).getImm() != StackMaps::ConstantOp ||
+          !MI->getOperand(Offset + 1).isImm())
+        report("stack map constant to STATEPOINT not well formed!", MI);
+    };
+    const unsigned VarStart = StatepointOpers(MI).getVarIdx();
+    VerifyStackMapConstant(VarStart + StatepointOpers::CCOffset);
+    VerifyStackMapConstant(VarStart + StatepointOpers::FlagsOffset);
+    VerifyStackMapConstant(VarStart + StatepointOpers::NumDeoptOperandsOffset);
+
+    // TODO: verify we have properly encoded deopt arguments
+    break;
+  }
+  }
}
void
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index a9813e534c5f..e9f8d43fe643 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -54,8 +54,6 @@ static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
const MBBVector &SaveBlocks,
const MBBVector &RestoreBlocks);
-static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS);
-
namespace {
class PEI : public MachineFunctionPass {
public:
@@ -84,7 +82,7 @@ private:
const MBBVector &SaveBlocks,
const MBBVector &RestoreBlocks)>
SpillCalleeSavedRegisters;
- std::function<void(MachineFunction &MF, RegScavenger *RS)>
+ std::function<void(MachineFunction &MF, RegScavenger &RS)>
ScavengeFrameVirtualRegs;
bool UsesCalleeSaves = false;
@@ -142,7 +140,6 @@ MachineFunctionPass *llvm::createPrologEpilogInserterPass() {
return new PEI();
}
-STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
STATISTIC(NumBytesStackSpace,
"Number of bytes used for stack in all functions");
@@ -168,10 +165,10 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
unsigned &, unsigned &, const MBBVector &,
const MBBVector &) {};
- ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
+ ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger &) {};
} else {
SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
- ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
+ ScavengeFrameVirtualRegs = scavengeFrameVirtualRegs;
UsesCalleeSaves = true;
}
}
@@ -222,7 +219,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// post-pass, scavenge the virtual registers that frame index elimination
// inserted.
if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
- ScavengeFrameVirtualRegs(Fn, RS);
+ ScavengeFrameVirtualRegs(Fn, *RS);
// Clear any vregs created by virtual scavenging.
Fn.getRegInfo().clearVirtRegs();
@@ -1153,92 +1150,3 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
RS->forward(MI);
}
}
-
-/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers
-/// with physical registers. Use the register scavenger to find an
-/// appropriate register to use.
-///
-/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
-/// iterate over the vreg use list, which at this point only contains machine
-/// operands for which eliminateFrameIndex need a new scratch reg.
-static void
-doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {
- // Run through the instructions and find any virtual registers.
- MachineRegisterInfo &MRI = MF.getRegInfo();
- for (MachineBasicBlock &MBB : MF) {
- RS->enterBasicBlock(MBB);
-
- int SPAdj = 0;
-
- // The instruction stream may change in the loop, so check MBB.end()
- // directly.
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
- // We might end up here again with a NULL iterator if we scavenged a
- // register for which we inserted spill code for definition by what was
- // originally the first instruction in MBB.
- if (I == MachineBasicBlock::iterator(nullptr))
- I = MBB.begin();
-
- const MachineInstr &MI = *I;
- MachineBasicBlock::iterator J = std::next(I);
- MachineBasicBlock::iterator P =
- I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
- : std::prev(I);
-
- // RS should process this instruction before we might scavenge at this
- // location. This is because we might be replacing a virtual register
- // defined by this instruction, and if so, registers killed by this
- // instruction are available, and defined registers are not.
- RS->forward(I);
-
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
-
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MO.isDef() && "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
- unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
-
- ++NumScavengedRegs;
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert(ScratchReg && "Missing scratch register!");
- MRI.replaceRegWith(Reg, ScratchReg);
-
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS->setRegUsed(ScratchReg);
- }
-
- // If the scavenger needed to use one of its spill slots, the
- // spill code will have been inserted in between I and J. This is a
- // problem because we need the spill code before I: Move I to just
- // prior to J.
- if (I != std::prev(J)) {
- MBB.splice(J, &MBB, I);
-
- // Before we move I, we need to prepare the RS to visit I again.
- // Specifically, RS will assert if it sees uses of registers that
- // it believes are undefined. Because we have already processed
- // register kills in I, when it visits I again, it will believe that
- // those registers are undefined. To avoid this situation, unprocess
- // the instruction I.
- assert(RS->getCurrentPosition() == I &&
- "The register scavenger has an unexpected position");
- I = P;
- RS->unprocess(P);
- } else
- ++I;
- }
- }
-
- MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
-}
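A note on the call-site change earlier in this file: ScavengeFrameVirtualRegs now takes the scavenger by reference rather than by pointer, so a null scavenger is unrepresentable at the call site. A minimal standalone sketch of that convention follows; all names here are hypothetical, not LLVM's.

```cpp
#include <cassert>

struct Scavenger { int scavenge() { return 42; } };

// Required collaborator: the callee cannot receive null, so no check needed.
int useRequired(Scavenger &RS) { return RS.scavenge(); }

// Optional collaborator: the pointer form documents that null is allowed.
int useOptional(Scavenger *RS) { return RS ? RS->scavenge() : 0; }

int main() {
  Scavenger S;
  assert(useRequired(S) == 42);
  assert(useOptional(nullptr) == 0);
}
```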
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index a87fed3a687e..24be7ea98d82 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -58,8 +58,9 @@ namespace {
/// whenever a register is unavailable. This is not practical in production but
/// provides a useful baseline both for measuring other allocators and comparing
/// the speed of the basic algorithm against other styles of allocators.
-class RABasic : public MachineFunctionPass, public RegAllocBase
-{
+class RABasic : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
// context
MachineFunction *MF;
@@ -72,6 +73,9 @@ class RABasic : public MachineFunctionPass, public RegAllocBase
// selectOrSplit().
BitVector UsableRegs;
+ bool LRE_CanEraseVirtReg(unsigned) override;
+ void LRE_WillShrinkVirtReg(unsigned) override;
+
public:
RABasic();
@@ -121,17 +125,46 @@ char RABasic::ID = 0;
} // end anonymous namespace
+char &llvm::RABasicID = RABasic::ID;
+
+INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
+INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false,
+ false)
+
+bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) {
+ if (VRM->hasPhys(VirtReg)) {
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ aboutToRemoveInterval(LI);
+ return true;
+ }
+ // Unassigned virtreg is probably in the priority queue.
+ // RegAllocBase will erase it after dequeueing.
+ return false;
+}
+
+void RABasic::LRE_WillShrinkVirtReg(unsigned VirtReg) {
+ if (!VRM->hasPhys(VirtReg))
+ return;
+
+ // Register is assigned, put it back on the queue for reassignment.
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ enqueue(&LI);
+}
+
RABasic::RABasic(): MachineFunctionPass(ID) {
- initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
- initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
- initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
- initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
- initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
- initializeLiveStacksPass(*PassRegistry::getPassRegistry());
- initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
- initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
- initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
- initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
}
void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -200,7 +233,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
Matrix->unassign(Spill);
// Spill the extracted interval.
- LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
}
return true;
@@ -259,7 +292,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
if (!VirtReg.isSpillable())
return ~0u;
- LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
// The live virtual register requesting allocation was spilled, so tell
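The hunks above wire RABasic into LiveRangeEdit's delegate callbacks by passing `this` where `nullptr` used to go, so erasing or shrinking a virtual register can consult the allocator's own bookkeeping first. Below is a self-contained sketch of that delegate pattern, with invented names standing in for LiveRangeEdit and RABasic.

```cpp
#include <iostream>

struct Delegate {
  virtual ~Delegate() = default;
  virtual bool canEraseVirtReg(unsigned VReg) = 0;
};

struct RangeEditor {
  Delegate *D;  // may be null, mirroring the nullptr the patch replaces
  explicit RangeEditor(Delegate *Dlg) : D(Dlg) {}
  void eraseVirtReg(unsigned VReg) {
    // Ask the observer before mutating shared allocator state.
    if (!D || D->canEraseVirtReg(VReg))
      std::cout << "erased vreg " << VReg << "\n";
  }
};

struct Allocator : private Delegate {
  bool canEraseVirtReg(unsigned VReg) override {
    return VReg != 0;  // the allocator can veto or update its own records
  }
  void run() {
    RangeEditor LRE(this);  // was effectively RangeEditor LRE(nullptr)
    LRE.eraseVirtReg(5);
  }
};

int main() { Allocator().run(); }
```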
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 3b5964eef55e..b2dfef91add5 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -49,9 +49,11 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Math.h"
#include "llvm/CodeGen/PBQP/Solution.h"
#include "llvm/CodeGen/PBQPRAConstraint.h"
#include "llvm/CodeGen/RegAllocPBQP.h"
@@ -139,13 +141,13 @@ public:
}
private:
- typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
- typedef std::vector<const LiveInterval*> Node2LIMap;
- typedef std::vector<unsigned> AllowedSet;
- typedef std::vector<AllowedSet> AllowedSetMap;
- typedef std::pair<unsigned, unsigned> RegPair;
- typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
- typedef std::set<unsigned> RegSet;
+ using LI2NodeMap = std::map<const LiveInterval *, unsigned>;
+ using Node2LIMap = std::vector<const LiveInterval *>;
+ using AllowedSet = std::vector<unsigned>;
+ using AllowedSetMap = std::vector<AllowedSet>;
+ using RegPair = std::pair<unsigned, unsigned>;
+ using CoalesceMap = std::map<RegPair, PBQP::PBQPNum>;
+ using RegSet = std::set<unsigned>;
char *customPassID;
@@ -212,12 +214,12 @@ public:
/// @brief Add interference edges between overlapping vregs.
class Interference : public PBQPRAConstraint {
private:
- typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
- typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;
- typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
- typedef DenseSet<IKey> DisjointAllowedRegsCache;
- typedef std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId> IEdgeKey;
- typedef DenseSet<IEdgeKey> IEdgeCache;
+ using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *;
+ using IKey = std::pair<AllowedRegVecPtr, AllowedRegVecPtr>;
+ using IMatrixCache = DenseMap<IKey, PBQPRAGraph::MatrixPtr>;
+ using DisjointAllowedRegsCache = DenseSet<IKey>;
+ using IEdgeKey = std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId>;
+ using IEdgeCache = DenseSet<IEdgeKey>;
bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
PBQPRAGraph::NodeId MId,
@@ -252,8 +254,8 @@ private:
// for the fast interference graph construction algorithm. The last is there
// to save us from looking up node ids via the VRegToNode map in the graph
// metadata.
- typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>
- IntervalInfo;
+ using IntervalInfo =
+ std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>;
static SlotIndex getStartPoint(const IntervalInfo &I) {
return std::get<0>(I)->segments[std::get<1>(I)].start;
@@ -320,9 +322,10 @@ public:
// Cache known disjoint allowed registers pairs
DisjointAllowedRegsCache D;
- typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet;
- typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
- decltype(&lowestStartPoint)> IntervalQueue;
+ using IntervalSet = std::set<IntervalInfo, decltype(&lowestEndPoint)>;
+ using IntervalQueue =
+ std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+ decltype(&lowestStartPoint)>;
IntervalSet Active(lowestEndPoint);
IntervalQueue Inactive(lowestStartPoint);
@@ -658,7 +661,6 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
SmallVectorImpl<unsigned> &NewIntervals,
MachineFunction &MF, LiveIntervals &LIS,
VirtRegMap &VRM, Spiller &VRegSpiller) {
-
VRegsToAlloc.erase(VReg);
LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
nullptr, &DeadRemats);
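The typedef-to-using conversions above are mechanical, but the alias form also scales to templates, which `typedef` cannot express at all. A standalone illustration, with types invented for the example:

```cpp
#include <map>
#include <vector>

// Equivalent forms; the alias reads left-to-right like an assignment.
typedef std::map<int, std::vector<int>> AdjListT;
using AdjList = std::map<int, std::vector<int>>;

// Only 'using' can be templated: an alias template has no typedef spelling.
template <typename T> using Matrix = std::vector<std::vector<T>>;

int main() {
  AdjList G;
  G[0].push_back(1);
  Matrix<double> M(2, std::vector<double>(2, 0.0));
  return static_cast<int>(M.size()) - 2;  // 0
}
```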
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 0635e5c0a63c..1aed58c36e17 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -15,18 +15,23 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/RegisterScavenging.h"
+
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -39,6 +44,8 @@ using namespace llvm;
#define DEBUG_TYPE "reg-scavenging"
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+
void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
LiveUnits.addRegMasked(Reg, LaneMask);
}
@@ -469,3 +476,120 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
return SReg;
}
+
+void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) {
+ // FIXME: Iterating over the instruction stream is unnecessary. We can simply
+ // iterate over the vreg use list, which at this point only contains machine
+ // operands for which eliminateFrameIndex needs a new scratch reg.
+
+ // Run through the instructions and find any virtual registers.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ RS.enterBasicBlock(MBB);
+
+ int SPAdj = 0;
+
+ // The instruction stream may change in the loop, so check MBB.end()
+ // directly.
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+ // We might end up here again with a NULL iterator if we scavenged a
+ // register for which we inserted spill code for a definition by what was
+ // originally the first instruction in MBB.
+ if (I == MachineBasicBlock::iterator(nullptr))
+ I = MBB.begin();
+
+ const MachineInstr &MI = *I;
+ MachineBasicBlock::iterator J = std::next(I);
+ MachineBasicBlock::iterator P =
+ I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
+ : std::prev(I);
+
+ // RS should process this instruction before we might scavenge at this
+ // location. This is because we might be replacing a virtual register
+ // defined by this instruction, and if so, registers killed by this
+ // instruction are available, and defined registers are not.
+ RS.forward(I);
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // When we first encounter a new virtual register, it
+ // must be a definition.
+ assert(MO.isDef() && "frame index virtual missing def!");
+ // Scavenge a new scratch register
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ unsigned ScratchReg = RS.scavengeRegister(RC, J, SPAdj);
+
+ ++NumScavengedRegs;
+
+ // Replace this reference to the virtual register with the
+ // scratch register.
+ assert(ScratchReg && "Missing scratch register!");
+ MRI.replaceRegWith(Reg, ScratchReg);
+
+ // Because this instruction was processed by the RS before this
+ // register was allocated, make sure that the RS now records the
+ // register as being used.
+ RS.setRegUsed(ScratchReg);
+ }
+
+ // If the scavenger needed to use one of its spill slots, the
+ // spill code will have been inserted in between I and J. This is a
+ // problem because we need the spill code before I: Move I to just
+ // prior to J.
+ if (I != std::prev(J)) {
+ MBB.splice(J, &MBB, I);
+
+ // Before we move I, we need to prepare the RS to visit I again.
+ // Specifically, RS will assert if it sees uses of registers that
+ // it believes are undefined. Because we have already processed
+ // register kills in I, when it visits I again, it will believe that
+ // those registers are undefined. To avoid this situation, unprocess
+ // the instruction I.
+ assert(RS.getCurrentPosition() == I &&
+ "The register scavenger has an unexpected position");
+ I = P;
+ RS.unprocess(P);
+ } else
+ ++I;
+ }
+ }
+
+ MRI.clearVirtRegs();
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+}
+
+namespace {
+/// This class runs register scavenging independently of the
+/// PrologEpilogInserter. This is used for testing.
+class ScavengerTest : public MachineFunctionPass {
+public:
+ static char ID;
+ ScavengerTest() : MachineFunctionPass(ID) {}
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetFrameLowering &TFL = *STI.getFrameLowering();
+
+ RegScavenger RS;
+ // Let's hope that calling those outside of PrologEpilogInserter works
+ // well enough to initialize the scavenger with some emergency spill slots
+ // for the target.
+ BitVector SavedRegs;
+ TFL.determineCalleeSaves(MF, SavedRegs, &RS);
+ TFL.processFunctionBeforeFrameFinalized(MF, &RS);
+
+ // Let's scavenge the current function
+ scavengeFrameVirtualRegs(MF, RS);
+ return true;
+ }
+};
+char ScavengerTest::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(ScavengerTest, "scavenger-test",
+ "Scavenge virtual registers inside basic blocks", false, false)
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 8035ea80364b..3fdbd2459361 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -12,32 +12,54 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -90,11 +112,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineLoopInfo *mli,
bool RemoveKillFlags)
: ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
- RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
- TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr),
+ RemoveKillFlags(RemoveKillFlags),
UnknownValue(UndefValue::get(
- Type::getVoidTy(mf.getFunction()->getContext()))),
- FirstDbgValue(nullptr) {
+ Type::getVoidTy(mf.getFunction()->getContext()))) {
DbgValues.clear();
const TargetSubtargetInfo &ST = mf.getSubtarget();
@@ -126,7 +146,7 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) {
return V;
}
assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
- } while (1);
+ } while (true);
}
/// This is a wrapper around GetUnderlyingObjects and adds support for basic
@@ -563,7 +583,7 @@ void ScheduleDAGInstrs::initSUnits() {
// which is contained within a basic block.
SUnits.reserve(NumRegionInstrs);
- for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) {
+ for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
if (MI.isDebugValue())
continue;
@@ -606,13 +626,13 @@ void ScheduleDAGInstrs::initSUnits() {
class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
/// Current total number of SUs in map.
- unsigned NumNodes;
+ unsigned NumNodes = 0;
/// 1 for loads, 0 for stores. (see comment in SUList)
unsigned TrueMemOrderLatency;
public:
- Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+ Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
/// To keep NumNodes up to date, insert() is used instead of
/// this operator w/ push_back().
@@ -630,7 +650,7 @@ public:
void inline clearList(ValueType V) {
iterator Itr = find(V);
if (Itr != end()) {
- assert (NumNodes >= Itr->second.size());
+ assert(NumNodes >= Itr->second.size());
NumNodes -= Itr->second.size();
Itr->second.clear();
@@ -646,7 +666,7 @@ public:
unsigned inline size() const { return NumNodes; }
/// Counts the number of SUs in this map after a reduction.
- void reComputeSize(void) {
+ void reComputeSize() {
NumNodes = 0;
for (auto &I : *this)
NumNodes += I.second.size();
@@ -676,7 +696,7 @@ void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
}
void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
- assert (BarrierChain != nullptr);
+ assert(BarrierChain != nullptr);
for (auto &I : map) {
SUList &sus = I.second;
@@ -687,7 +707,7 @@ void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
}
void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
- assert (BarrierChain != nullptr);
+ assert(BarrierChain != nullptr);
// Go through all lists of SUs.
for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
@@ -1028,7 +1048,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
// The N last elements in NodeNums will be removed, and the SU with
// the lowest NodeNum of them will become the new BarrierChain to
// let the not yet seen SUs have a dependency to the removed SUs.
- assert (N <= NodeNums.size());
+ assert(N <= NodeNums.size());
SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
if (BarrierChain) {
// The aliasing and non-aliasing maps reduce independently of each
@@ -1156,6 +1176,7 @@ std::string ScheduleDAGInstrs::getDAGName() const {
//===----------------------------------------------------------------------===//
namespace llvm {
+
/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
SchedDFSResult &R;
@@ -1163,16 +1184,16 @@ class SchedDFSImpl {
/// Join DAG nodes into equivalence classes by their subtree.
IntEqClasses SubtreeClasses;
/// List PredSU, SuccSU pairs that represent data edges between subtrees.
- std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
+ std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
struct RootData {
unsigned NodeID;
unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
- unsigned SubInstrCount; ///< Instr count in this tree only, not children.
+ unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
+ /// children.
RootData(unsigned id): NodeID(id),
- ParentNodeID(SchedDFSResult::InvalidSubtreeID),
- SubInstrCount(0) {}
+ ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
unsigned getSparseSetIndex() const { return NodeID; }
};
@@ -1340,12 +1361,15 @@ protected:
} while (FromTree != SchedDFSResult::InvalidSubtreeID);
}
};
+
} // end namespace llvm
namespace {
+
/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
- std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
+ std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
+
public:
bool isComplete() const { return DFSStack.empty(); }
@@ -1367,7 +1391,8 @@ public:
return getCurr()->Preds.end();
}
};
-} // anonymous
+
+} // end anonymous namespace
static bool hasDataSucc(const SUnit *SU) {
for (const SDep &SuccDep : SU->Succs) {
@@ -1392,7 +1417,7 @@ void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
SchedDAGReverseDFS DFS;
Impl.visitPreorder(&SU);
DFS.follow(&SU);
- for (;;) {
+ while (true) {
// Traverse the leftmost path as far as possible.
while (DFS.getPred() != DFS.getPredEnd()) {
const SDep &PredDep = *DFS.getPred();
@@ -1457,4 +1482,5 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
}
} // end namespace llvm
+
#endif
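Several hunks above (Value2SUsMap, RootData, the ScheduleDAGInstrs constructor) move defaults out of constructor init lists into in-class member initializers. A minimal standalone example of the idiom, with a made-up stand-in type:

```cpp
#include <cassert>

struct Value2SUsMapSketch {            // hypothetical stand-in type
  unsigned NumNodes = 0;               // default shared by all constructors
  unsigned TrueMemOrderLatency;
  explicit Value2SUsMapSketch(unsigned Lat = 0) : TrueMemOrderLatency(Lat) {}
};

int main() {
  Value2SUsMapSketch Loads(1), Stores;
  assert(Loads.NumNodes == 0 && Stores.NumNodes == 0);
  assert(Loads.TrueMemOrderLatency == 1 && Stores.TrueMemOrderLatency == 0);
}
```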
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 5f167f8de1cf..9355dbe77f94 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -225,6 +225,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
}
return TranslateLegalizeResults(Op, Lowered);
}
+ LLVM_FALLTHROUGH;
case TargetLowering::Expand:
Changed = true;
return LegalizeOp(ExpandLoad(Op));
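The added LLVM_FALLTHROUGH marks the drop from the Custom case into Expand as deliberate; where C++17 is available it expands to the [[fallthrough]] attribute. A self-contained illustration using the standard attribute directly (toy function, not the legalizer):

```cpp
int classify(int Kind) {
  int Cost = 0;
  switch (Kind) {
  case 1:
    Cost += 10;          // extra work unique to Kind 1
    [[fallthrough]];     // deliberate: Kind 1 also pays Kind 2's cost
  case 2:
    Cost += 1;
    break;
  default:
    break;
  }
  return Cost;
}

int main() { return (classify(1) == 11 && classify(2) == 1) ? 0 : 1; }
```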
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 177898e1e950..80a03ea4eea0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1,4 +1,4 @@
-//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,29 +11,46 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -41,16 +58,20 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
-#include <cmath>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <set>
+#include <string>
#include <utility>
+#include <vector>
using namespace llvm;
@@ -269,7 +290,6 @@ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
return ISD::CondCode(Operation);
}
-
/// For an integer comparison, return 1 if the comparison is a signed operation
/// and 2 if the result is an unsigned comparison. Return zero if the operation
/// does not depend on the sign of the input (setne and seteq).
@@ -338,7 +358,6 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
//===----------------------------------------------------------------------===//
/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
-///
static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
ID.AddInteger(OpC);
}
@@ -350,7 +369,6 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
-///
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDValue> Ops) {
for (auto& Op : Ops) {
@@ -360,7 +378,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
-///
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDUse> Ops) {
for (auto& Op : Ops) {
@@ -392,10 +409,9 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
}
case ISD::TargetConstantFP:
- case ISD::ConstantFP: {
+ case ISD::ConstantFP:
ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
break;
- }
case ISD::TargetGlobalAddress:
case ISD::GlobalAddress:
case ISD::TargetGlobalTLSAddress:
@@ -770,7 +786,6 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
/// maps and modified in place. Add it back to the CSE maps, unless an identical
/// node already exists, in which case transfer all its users to the existing
/// node. This transfer can potentially trigger recursive merging.
-///
void
SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
// For node types that aren't CSE'd, just act as if no identical node
@@ -835,7 +850,6 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
return Node;
}
-
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null, otherwise return a pointer to the slot it would take. If a
@@ -864,10 +878,9 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
- : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL),
+ : TM(tm), OptLevel(OL),
EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
- Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
- UpdateListeners(nullptr) {
+ Root(getEntryNode()) {
InsertNode(&EntryNode);
DbgInfo = new SDDbgInfo();
}
@@ -1038,7 +1051,6 @@ SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
}
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
-///
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue NegOne =
@@ -1317,7 +1329,6 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
return SDValue(N, 0);
}
-
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
@@ -1451,7 +1462,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
// Validate that all indices in Mask are within the range of the elements
// input to the shuffle.
int NElts = Mask.size();
- assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
+ assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
"Index out of range");
// Copy the mask so we can do any needed cleanup.
@@ -2918,7 +2929,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
else
DemandedRHS.setBit((unsigned)M % NumElts);
}
- Tmp = UINT_MAX;
+ Tmp = std::numeric_limits<unsigned>::max();
if (!!DemandedLHS)
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS) {
@@ -3122,7 +3133,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then get its sign bits.
- Tmp = UINT_MAX;
+ Tmp = std::numeric_limits<unsigned>::max();
if (DemandedElts[EltIdx]) {
// TODO - handle implicit truncation of inserted elements.
if (InVal.getScalarValueSizeInBits() != VTBits)
@@ -3188,7 +3199,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::CONCAT_VECTORS:
// Determine the minimum number of sign bits across all demanded
// elts of the input vectors. Early out if the result is already 1.
- Tmp = UINT_MAX;
+ Tmp = std::numeric_limits<unsigned>::max();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
@@ -3327,7 +3338,7 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
- llvm::SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
assert(llvm::all_of(Ops,
[Ops](SDValue Op) {
@@ -3836,8 +3847,9 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
return true;
return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
- any_of(Divisor->op_values(),
- [](SDValue V) { return V.isUndef() || isNullConstant(V); });
+ llvm::any_of(Divisor->op_values(),
+ [](SDValue V) { return V.isUndef() ||
+ isNullConstant(V); });
// TODO: Handle signed overflow.
}
// TODO: Handle oversized shifts.
@@ -3948,8 +3960,8 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
// All operands must be vector types with the same number of elements as
// the result type and must be either UNDEF or a build vector of constant
// or UNDEF scalars.
- if (!all_of(Ops, IsConstantBuildVectorOrUndef) ||
- !all_of(Ops, IsScalarOrSameVectorSize))
+ if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) ||
+ !llvm::all_of(Ops, IsScalarOrSameVectorSize))
return SDValue();
// If we are comparing vectors, then the result needs to be a i1 boolean
@@ -5550,7 +5562,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
Opcode == ISD::PREFETCH ||
Opcode == ISD::LIFETIME_START ||
Opcode == ISD::LIFETIME_END ||
- (Opcode <= INT_MAX &&
+ ((int)Opcode <= std::numeric_limits<int>::max() &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
@@ -5884,7 +5896,6 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Mask, SDValue Src0,
EVT MemVT, MachineMemOperand *MMO,
ISD::LoadExtType ExtTy, bool isExpanding) {
-
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
FoldingSetNodeID ID;
@@ -6038,13 +6049,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
switch (Opcode) {
default: break;
- case ISD::CONCAT_VECTORS: {
+ case ISD::CONCAT_VECTORS:
// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
- }
- case ISD::SELECT_CC: {
+ case ISD::SELECT_CC:
assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
assert(Ops[0].getValueType() == Ops[1].getValueType() &&
"LHS and RHS of condition must have same type!");
@@ -6053,14 +6063,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Ops[2].getValueType() == VT &&
"select_cc node must be of same type as true and false value!");
break;
- }
- case ISD::BR_CC: {
+ case ISD::BR_CC:
assert(NumOps == 5 && "BR_CC takes 5 operands!");
assert(Ops[2].getValueType() == Ops[3].getValueType() &&
"LHS/RHS of comparison should match types!");
break;
}
- }
// Memoize nodes.
SDNode *N;
@@ -6599,7 +6607,6 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
return Res;
}
-
/// getMachineNode - These are used for target selectors to create a new node
/// with specified return type(s), MachineInstr opcode, and operands.
///
@@ -6812,7 +6819,7 @@ public:
: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
};
-}
+} // end anonymous namespace
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
@@ -6858,7 +6865,6 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
AddModifiedNodeToCSEMaps(User);
}
-
// If we just RAUW'd the root, take note.
if (FromN == getRoot())
setRoot(To);
@@ -7028,6 +7034,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
}
namespace {
+
/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
/// to record information about a use.
struct UseMemo {
@@ -7040,7 +7047,8 @@ namespace {
bool operator<(const UseMemo &L, const UseMemo &R) {
return (intptr_t)L.User < (intptr_t)R.User;
}
-}
+
+} // end anonymous namespace
/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The same value
@@ -7106,7 +7114,6 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
/// based on their topological order. It returns the maximum id and a vector
/// of the SDNodes* in assigned order by reference.
unsigned SelectionDAG::AssignTopologicalOrder() {
-
unsigned DAGSize = 0;
// SortedPos tracks the progress of the algorithm. Nodes before it are
@@ -7333,6 +7340,7 @@ void SDNode::Profile(FoldingSetNodeID &ID) const {
}
namespace {
+
struct EVTArray {
std::vector<EVT> VTs;
@@ -7342,11 +7350,12 @@ namespace {
VTs.push_back(MVT((MVT::SimpleValueType)i));
}
};
-}
-static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs;
+} // end anonymous namespace
+
+static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
static ManagedStatic<EVTArray> SimpleVTArray;
-static ManagedStatic<sys::SmartMutex<true> > VTMutex;
+static ManagedStatic<sys::SmartMutex<true>> VTMutex;
/// getValueTypeList - Return a pointer to the specified value type.
///
@@ -7380,7 +7389,6 @@ bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
return NUses == 0;
}
-
/// hasAnyUseOfValue - Return true if there are any use of the indicated
/// value. This method ignores uses of other values defined by this operation.
bool SDNode::hasAnyUseOfValue(unsigned Value) const {
@@ -7393,9 +7401,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const {
return false;
}
-
/// isOnlyUserOf - Return true if this node is the only use of N.
-///
bool SDNode::isOnlyUserOf(const SDNode *N) const {
bool Seen = false;
for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
@@ -7425,7 +7431,6 @@ bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
}
/// isOperand - Return true if this node is an operand of N.
-///
bool SDValue::isOperandOf(const SDNode *N) const {
for (const SDValue &Op : N->op_values())
if (*this == Op)
@@ -7475,7 +7480,7 @@ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
}
// Next, try a deep search: check whether every operand of the TokenFactor
// reaches Dest.
- return all_of((*this)->ops(), [=](SDValue Op) {
+ return llvm::all_of((*this)->ops(), [=](SDValue Op) {
return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
});
}
@@ -7627,7 +7632,6 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
return false;
}
-
/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
@@ -7718,7 +7722,6 @@ unsigned GlobalAddressSDNode::getAddressSpace() const {
return getGlobal()->getType()->getAddressSpace();
}
-
Type *ConstantPoolSDNode::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
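The UINT_MAX and INT_MAX replacements above swap <climits> macros for std::numeric_limits, which is typed and usable in templates. A standalone example of the difference:

```cpp
#include <cassert>
#include <limits>

template <typename T> bool isSentinel(T V) {
  return V == std::numeric_limits<T>::max();  // works for any unsigned T
}

int main() {
  unsigned Tmp = std::numeric_limits<unsigned>::max();  // was UINT_MAX
  assert(isSentinel(Tmp));
  assert(!isSentinel(0u));
}
```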
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 687b882c5e4d..b5ccd64ee76c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2022,7 +2022,7 @@ static SDNode *findGlueUse(SDNode *N) {
}
/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
-/// This function recursively traverses up the operand chain, ignoring
+/// This function iteratively traverses up the operand chain, ignoring
/// certain nodes.
static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
@@ -2035,30 +2035,36 @@ static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
// The Use may be -1 (unassigned) if it is a newly allocated node. This can
// happen because we scan down to newly selected nodes in the case of glue
// uses.
- if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1))
- return false;
+ std::vector<SDNode *> WorkList;
+ WorkList.push_back(Use);
- // Don't revisit nodes if we already scanned it and didn't fail, we know we
- // won't fail if we scan it again.
- if (!Visited.insert(Use).second)
- return false;
+ while (!WorkList.empty()) {
+ Use = WorkList.back();
+ WorkList.pop_back();
+ if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
+ continue;
- for (const SDValue &Op : Use->op_values()) {
- // Ignore chain uses, they are validated by HandleMergeInputChains.
- if (Op.getValueType() == MVT::Other && IgnoreChains)
+ // Don't revisit nodes if we already scanned it and didn't fail, we know we
+ // won't fail if we scan it again.
+ if (!Visited.insert(Use).second)
continue;
- SDNode *N = Op.getNode();
- if (N == Def) {
- if (Use == ImmedUse || Use == Root)
- continue; // We are not looking for immediate use.
- assert(N != Root);
- return true;
- }
+ for (const SDValue &Op : Use->op_values()) {
+ // Ignore chain uses, they are validated by HandleMergeInputChains.
+ if (Op.getValueType() == MVT::Other && IgnoreChains)
+ continue;
- // Traverse up the operand chain.
- if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains))
- return true;
+ SDNode *N = Op.getNode();
+ if (N == Def) {
+ if (Use == ImmedUse || Use == Root)
+ continue; // We are not looking for immediate use.
+ assert(N != Root);
+ return true;
+ }
+
+ // Traverse up the operand chain.
+ WorkList.push_back(N);
+ }
}
return false;
}
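findNonImmUse above is converted from recursion to an explicit worklist, bounding stack depth on deep DAGs while keeping the visited-set early-out. A self-contained sketch of the same transformation on a toy node graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

struct Node { std::vector<Node *> Ops; };

// Returns true if Def is reachable from Use through operand edges.
bool reaches(Node *Use, Node *Def) {
  std::set<Node *> Visited;
  std::vector<Node *> WorkList{Use};
  while (!WorkList.empty()) {
    Node *N = WorkList.back();
    WorkList.pop_back();
    if (!Visited.insert(N).second)
      continue;                  // already scanned and didn't fail
    for (Node *Op : N->Ops) {
      if (Op == Def)
        return true;
      WorkList.push_back(Op);    // "recurse" by deferring to the list
    }
  }
  return false;
}

int main() {
  Node D, B{{&D}}, A{{&B}};
  assert(reaches(&A, &D) && !reaches(&D, &A));
}
```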
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 1c66649cae01..eed667dbe7e0 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -818,7 +818,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
SI.GCTransitionArgs =
ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
SI.ID = ISP.getID();
- SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end());
+ SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
SI.StatepointFlags = ISP.getFlags();
SI.NumPatchBytes = ISP.getNumPatchBytes();
SI.EHPadBB = EHPadBB;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0dffffee9976..adb2b188265b 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1493,8 +1493,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
- // Ensure that the constant occurs on the RHS, and fold constant
- // comparisons.
+ // Ensure that the constant occurs on the RHS and fold constant comparisons.
ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
if (isa<ConstantSDNode>(N0.getNode()) &&
(DCI.isBeforeLegalizeOps() ||
@@ -1638,14 +1637,13 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
TopSetCC.getOperand(1),
InvCond);
-
}
}
}
- // If the LHS is '(and load, const)', the RHS is 0,
- // the test is for equality or unsigned, and all 1 bits of the const are
- // in the same partial word, see if we can shorten the load.
+ // If the LHS is '(and load, const)', the RHS is 0, the test is for
+ // equality or unsigned, and all 1 bits of the const are in the same
+ // partial word, see if we can shorten the load.
if (DCI.isBeforeLegalize() &&
!ISD::isSignedIntSetCC(Cond) &&
N0.getOpcode() == ISD::AND && C1 == 0 &&
@@ -1669,10 +1667,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
if ((newMask & Mask) == Mask) {
- if (!DAG.getDataLayout().isLittleEndian())
- bestOffset = (origWidth/width - offset - 1) * (width/8);
- else
+ if (DAG.getDataLayout().isLittleEndian())
bestOffset = (uint64_t)offset * (width/8);
+ else
+ bestOffset = (origWidth/width - offset - 1) * (width/8);
bestMask = Mask.lshr(offset * (width/8) * 8);
bestWidth = width;
break;
@@ -1713,10 +1711,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
switch (Cond) {
case ISD::SETUGT:
case ISD::SETUGE:
- case ISD::SETEQ: return DAG.getConstant(0, dl, VT);
+ case ISD::SETEQ:
+ return DAG.getConstant(0, dl, VT);
case ISD::SETULT:
case ISD::SETULE:
- case ISD::SETNE: return DAG.getConstant(1, dl, VT);
+ case ISD::SETNE:
+ return DAG.getConstant(1, dl, VT);
case ISD::SETGT:
case ISD::SETGE:
// True if the sign bit of C1 is set.
@@ -1816,9 +1816,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
BitWidth-1))) {
// Okay, get the un-inverted input value.
SDValue Val;
- if (N0.getOpcode() == ISD::XOR)
+ if (N0.getOpcode() == ISD::XOR) {
Val = N0.getOperand(0);
- else {
+ } else {
assert(N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::XOR);
// ((X^1)&1)^1 -> X & 1
@@ -1883,7 +1883,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Canonicalize GE/LE comparisons to use GT/LT comparisons.
if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
- if (C1 == MinVal) return DAG.getConstant(1, dl, VT); // X >= MIN --> true
+ // X >= MIN --> true
+ if (C1 == MinVal)
+ return DAG.getConstant(1, dl, VT);
+
// X >= C0 --> X > (C0 - 1)
APInt C = C1 - 1;
ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
@@ -1898,7 +1901,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
- if (C1 == MaxVal) return DAG.getConstant(1, dl, VT); // X <= MAX --> true
+ // X <= MAX --> true
+ if (C1 == MaxVal)
+ return DAG.getConstant(1, dl, VT);
+
// X <= C0 --> X < (C0 + 1)
APInt C = C1 + 1;
ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
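The reformatted hunks above implement the canonicalizations X >= C0 --> X > (C0 - 1) and X <= C0 --> X < (C0 + 1), with the MinVal/MaxVal early-outs ensuring the adjusted constant cannot wrap. A standalone exhaustive check of the unsigned identity on a small range:

```cpp
#include <cassert>

int main() {
  // C == 0 corresponds to the C1 == MinVal early-out returning "true".
  for (unsigned C = 1; C < 8; ++C)
    for (unsigned X = 0; X < 8; ++X)
      assert((X >= C) == (X > C - 1));
}
```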
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 900c0318b179..c43a5e18ad23 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -1456,6 +1456,7 @@ void TargetLoweringBase::computeRegisterProperties(
}
if (IsLegalWiderType)
break;
+ LLVM_FALLTHROUGH;
}
case TypeWidenVector: {
// Try to widen the vector.
@@ -1473,6 +1474,7 @@ void TargetLoweringBase::computeRegisterProperties(
}
if (IsLegalWiderType)
break;
+ LLVM_FALLTHROUGH;
}
case TypeSplitVector:
case TypeScalarizeVector: {
diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
index 282e3103adc9..711144fc2faa 100644
--- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
@@ -27,6 +27,14 @@ Error CodeViewRecordIO::beginRecord(Optional<uint32_t> MaxLength) {
Error CodeViewRecordIO::endRecord() {
assert(!Limits.empty() && "Not in a record!");
Limits.pop_back();
+ // We would like to assert that we actually read / wrote all the bytes that we
+ // expected to for this record, but unfortunately we can't do this. Some
+ // producers such as MASM over-allocate for certain types of records and
+ // commit the extraneous data, so when reading we can't be sure every byte
+ // will have been read. And when writing we over-allocate temporarily since
+ // we don't know how big the record is until we're finished writing it, so
+ // even though we don't commit the extraneous data, we still can't guarantee
+ // we're at the end of the allocated data.
return Error::success();
}
@@ -49,6 +57,12 @@ uint32_t CodeViewRecordIO::maxFieldLength() const {
return *Min;
}
+Error CodeViewRecordIO::padToAlignment(uint32_t Align) {
+ if (isReading())
+ return Reader->padToAlignment(Align);
+ return Writer->padToAlignment(Align);
+}
+
Error CodeViewRecordIO::skipPadding() {
assert(!isWriting() && "Cannot skip padding while writing!");
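The new CodeViewRecordIO::padToAlignment simply forwards to whichever of the reader or writer is active. For reference, the usual arithmetic behind such a helper (a sketch, not the BinaryStreamReader/Writer implementation) is:

```cpp
#include <cassert>
#include <cstdint>

// How many pad bytes advance Offset to the next multiple of Align,
// where Align is a power of two.
uint32_t paddingBytes(uint32_t Offset, uint32_t Align) {
  uint32_t Aligned = (Offset + Align - 1) & ~(Align - 1);
  return Aligned - Offset;
}

int main() {
  assert(paddingBytes(13, 4) == 3);  // 13 -> 16
  assert(paddingBytes(16, 4) == 0);  // already aligned
}
```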
diff --git a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
index b8741eb0b675..2e72242181b0 100644
--- a/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
+++ b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
@@ -72,7 +72,7 @@ Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const {
uint32_t DebugStringTableSubsection::size() const { return Strings.size(); }
uint32_t DebugStringTableSubsection::getStringId(StringRef S) const {
- auto P = Strings.find(S);
- assert(P != Strings.end());
- return P->second;
+ auto Iter = Strings.find(S);
+ assert(Iter != Strings.end());
+ return Iter->second;
}
diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
index 511f36d0020a..cfd1c5d3ab0c 100644
--- a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
+++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
@@ -16,14 +16,17 @@ using namespace llvm;
using namespace llvm::codeview;
DebugSubsectionRecord::DebugSubsectionRecord()
- : Kind(DebugSubsectionKind::None) {}
+ : Container(CodeViewContainer::ObjectFile),
+ Kind(DebugSubsectionKind::None) {}
DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind,
- BinaryStreamRef Data)
- : Kind(Kind), Data(Data) {}
+ BinaryStreamRef Data,
+ CodeViewContainer Container)
+ : Container(Container), Kind(Kind), Data(Data) {}
Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
- DebugSubsectionRecord &Info) {
+ DebugSubsectionRecord &Info,
+ CodeViewContainer Container) {
const DebugSubsectionHeader *Header;
BinaryStreamReader Reader(Stream);
if (auto EC = Reader.readObject(Header))
@@ -41,13 +44,14 @@ Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
}
if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
return EC;
+ Info.Container = Container;
Info.Kind = Kind;
return Error::success();
}
uint32_t DebugSubsectionRecord::getRecordLength() const {
uint32_t Result = sizeof(DebugSubsectionHeader) + Data.getLength();
- assert(Result % 4 == 0);
+ assert(Result % alignOf(Container) == 0);
return Result;
}
@@ -56,25 +60,29 @@ DebugSubsectionKind DebugSubsectionRecord::kind() const { return Kind; }
BinaryStreamRef DebugSubsectionRecord::getRecordData() const { return Data; }
DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
- DebugSubsectionKind Kind, DebugSubsection &Frag)
- : Kind(Kind), Frag(Frag) {}
+ std::unique_ptr<DebugSubsection> Subsection, CodeViewContainer Container)
+ : Subsection(std::move(Subsection)), Container(Container) {}
uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() {
- uint32_t Size = sizeof(DebugSubsectionHeader) +
- alignTo(Frag.calculateSerializedSize(), 4);
+ uint32_t Size =
+ sizeof(DebugSubsectionHeader) +
+ alignTo(Subsection->calculateSerializedSize(), alignOf(Container));
return Size;
}
Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) {
+ assert(Writer.getOffset() % alignOf(Container) == 0 &&
+ "Debug Subsection not properly aligned");
+
DebugSubsectionHeader Header;
- Header.Kind = uint32_t(Kind);
+ Header.Kind = uint32_t(Subsection->kind());
Header.Length = calculateSerializedLength() - sizeof(DebugSubsectionHeader);
if (auto EC = Writer.writeObject(Header))
return EC;
- if (auto EC = Frag.commit(Writer))
+ if (auto EC = Subsection->commit(Writer))
return EC;
- if (auto EC = Writer.padToAlignment(4))
+ if (auto EC = Writer.padToAlignment(alignOf(Container)))
return EC;
return Error::success();
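The DebugSubsectionRecord changes thread a CodeViewContainer through so that alignment becomes container-dependent: per the asserts above, PDB subsections are 4-byte aligned while object-file records are packed. A minimal sketch of an alignOf helper with that behavior (assumed from the patch, not copied from LLVM):

```cpp
#include <cassert>
#include <cstdint>

enum class CodeViewContainer { ObjectFile, Pdb };

uint32_t alignOf(CodeViewContainer C) {
  // Subsections in a PDB are padded to 4 bytes; object files pack records.
  return C == CodeViewContainer::Pdb ? 4 : 1;
}

int main() {
  assert(alignOf(CodeViewContainer::Pdb) == 4);
  assert(alignOf(CodeViewContainer::ObjectFile) == 1);
}
```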
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index 3d49a7198d1a..66045933ce9b 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -668,7 +668,7 @@ Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) {
Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
SymbolVisitorCallbackPipeline Pipeline;
- SymbolDeserializer Deserializer(ObjDelegate.get());
+ SymbolDeserializer Deserializer(ObjDelegate.get(), Container);
CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
@@ -679,7 +679,7 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) {
SymbolVisitorCallbackPipeline Pipeline;
- SymbolDeserializer Deserializer(ObjDelegate.get());
+ SymbolDeserializer Deserializer(ObjDelegate.get(), Container);
CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
index bb1731465495..ea46841a70f6 100644
--- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
+++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
@@ -40,6 +40,7 @@ Error SymbolRecordMapping::visitSymbolBegin(CVSymbol &Record) {
}
Error SymbolRecordMapping::visitSymbolEnd(CVSymbol &Record) {
+ error(IO.padToAlignment(alignOf(Container)));
error(IO.endRecord());
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
index 251cc431f52b..9f2d619d1a1c 100644
--- a/lib/DebugInfo/CodeView/SymbolSerializer.cpp
+++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
@@ -12,9 +12,11 @@
using namespace llvm;
using namespace llvm::codeview;
-SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator)
- : Storage(Allocator), RecordBuffer(MaxRecordLength), Stream(RecordBuffer, llvm::support::little),
- Writer(Stream), Mapping(Writer) { }
+SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container)
+ : Storage(Allocator), RecordBuffer(MaxRecordLength),
+ Stream(RecordBuffer, llvm::support::little), Writer(Stream),
+ Mapping(Writer, Container) {}
Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) {
assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!");
diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp
index dfdeb8414212..faf2442bc94b 100644
--- a/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -47,42 +47,46 @@ static Interval intersect(const Interval &I1, const Interval &I2) {
MappedBlockStream::MappedBlockStream(uint32_t BlockSize,
const MSFStreamLayout &Layout,
- BinaryStreamRef MsfData)
- : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData) {}
-
-std::unique_ptr<MappedBlockStream>
-MappedBlockStream::createStream(uint32_t BlockSize,
- const MSFStreamLayout &Layout,
- BinaryStreamRef MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator)
+ : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData),
+ Allocator(Allocator) {}
+
+std::unique_ptr<MappedBlockStream> MappedBlockStream::createStream(
+ uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
- BlockSize, Layout, MsfData);
+ BlockSize, Layout, MsfData, Allocator);
}
std::unique_ptr<MappedBlockStream> MappedBlockStream::createIndexedStream(
- const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex) {
+ const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex,
+ BumpPtrAllocator &Allocator) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
SL.Blocks = Layout.StreamMap[StreamIndex];
SL.Length = Layout.StreamSizes[StreamIndex];
return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
- Layout.SB->BlockSize, SL, MsfData);
+ Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
- BinaryStreamRef MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
- return createStream(Layout.SB->BlockSize, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createFpmStream(const MSFLayout &Layout,
- BinaryStreamRef MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
- return createStream(Layout.SB->BlockSize, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
@@ -148,7 +152,7 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
// into it, and return an ArrayRef to that. Do not touch existing pool
// allocations, as existing clients may be holding a pointer which must
// not be invalidated.
- uint8_t *WriteBuffer = static_cast<uint8_t *>(Pool.Allocate(Size, 8));
+ uint8_t *WriteBuffer = static_cast<uint8_t *>(Allocator.Allocate(Size, 8));
if (auto EC = readBytes(Offset, MutableArrayRef<uint8_t>(WriteBuffer, Size)))
return EC;
@@ -269,10 +273,6 @@ Error MappedBlockStream::readBytes(uint32_t Offset,
return Error::success();
}
-uint32_t MappedBlockStream::getNumBytesCopied() const {
- return static_cast<uint32_t>(Pool.getBytesAllocated());
-}
-
void MappedBlockStream::invalidateCache() { CacheMap.shrink_and_clear(); }
void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
@@ -313,43 +313,48 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
WritableMappedBlockStream::WritableMappedBlockStream(
uint32_t BlockSize, const MSFStreamLayout &Layout,
- WritableBinaryStreamRef MsfData)
- : ReadInterface(BlockSize, Layout, MsfData), WriteInterface(MsfData) {}
+ WritableBinaryStreamRef MsfData, BumpPtrAllocator &Allocator)
+ : ReadInterface(BlockSize, Layout, MsfData, Allocator),
+ WriteInterface(MsfData) {}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createStream(uint32_t BlockSize,
const MSFStreamLayout &Layout,
- WritableBinaryStreamRef MsfData) {
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
return llvm::make_unique<MappedBlockStreamImpl<WritableMappedBlockStream>>(
- BlockSize, Layout, MsfData);
+ BlockSize, Layout, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
WritableBinaryStreamRef MsfData,
- uint32_t StreamIndex) {
+ uint32_t StreamIndex,
+ BumpPtrAllocator &Allocator) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
SL.Blocks = Layout.StreamMap[StreamIndex];
SL.Length = Layout.StreamSizes[StreamIndex];
- return createStream(Layout.SB->BlockSize, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createDirectoryStream(
- const MSFLayout &Layout, WritableBinaryStreamRef MsfData) {
+ const MSFLayout &Layout, WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
- return createStream(Layout.SB->BlockSize, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout,
- WritableBinaryStreamRef MsfData) {
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
- return createStream(Layout.SB->BlockSize, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
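The hunks above replace MappedBlockStream's private allocation pool with a BumpPtrAllocator supplied by the caller (in practice the owning PDBFile), so buffers handed out by readBytes stay valid for the lifetime of the file rather than the lifetime of one stream. A minimal standalone sketch of that ownership pattern, using stand-in types rather than the LLVM classes:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in for llvm::BumpPtrAllocator: every allocation lives until the
// allocator itself is destroyed.
class BumpAllocator {
  std::vector<std::unique_ptr<uint8_t[]>> Slabs;
public:
  uint8_t *allocate(size_t Size) {
    Slabs.push_back(std::make_unique<uint8_t[]>(Size));
    return Slabs.back().get();
  }
};

class Stream {
  BumpAllocator &Alloc; // borrowed; the file that owns it outlives every Stream
public:
  explicit Stream(BumpAllocator &A) : Alloc(A) {}
  // Copy Size bytes into allocator-owned memory. Callers may keep the returned
  // pointer after this Stream is destroyed, which is the guarantee the patch
  // needs when many short-lived streams are created over one file.
  uint8_t *read(const uint8_t *Src, size_t Size) {
    uint8_t *Buf = Alloc.allocate(Size);
    std::copy(Src, Src + Size, Buf);
    return Buf;
  }
};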
diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index b28ec2ff33ac..22c2ef31bd71 100644
--- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -66,7 +66,11 @@ void DbiModuleDescriptorBuilder::setObjFileName(StringRef Name) {
void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) {
Symbols.push_back(Symbol);
- SymbolByteSize += Symbol.data().size();
+ // Symbols written to a PDB file are required to be 4-byte aligned. The same
+ // is not true of symbols in object files.
+ assert(Symbol.length() % alignOf(CodeViewContainer::Pdb) == 0 &&
+ "Invalid Symbol alignment!");
+ SymbolByteSize += Symbol.length();
}
void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) {
@@ -140,7 +144,7 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
if (Layout.ModDiStream != kInvalidStreamIndex) {
auto NS = WritableMappedBlockStream::createIndexedStream(
- MsfLayout, MsfBuffer, Layout.ModDiStream);
+ MsfLayout, MsfBuffer, Layout.ModDiStream, MSF.getAllocator());
WritableBinaryStreamRef Ref(*NS);
BinaryStreamWriter SymbolWriter(Ref);
// Write the symbols.
@@ -153,7 +157,8 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
if (auto EC = SymbolWriter.writeStreamRef(RecordsRef))
return EC;
// TODO: Write C11 Line data
-
+ assert(SymbolWriter.getOffset() % alignOf(CodeViewContainer::Pdb) == 0 &&
+ "Invalid debug section alignment!");
for (const auto &Builder : C13Builders) {
assert(Builder && "Empty C13 Fragment Builder!");
if (auto EC = Builder->commit(SymbolWriter))
@@ -169,42 +174,9 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
return Error::success();
}
-void DbiModuleDescriptorBuilder::addC13Fragment(
- std::unique_ptr<DebugLinesSubsection> Lines) {
- DebugLinesSubsection &Frag = *Lines;
-
- // File Checksums have to come first, so push an empty entry on if this
- // is the first.
- if (C13Builders.empty())
- C13Builders.push_back(nullptr);
-
- this->LineInfo.push_back(std::move(Lines));
- C13Builders.push_back(
- llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag));
-}
-
-void DbiModuleDescriptorBuilder::addC13Fragment(
- std::unique_ptr<codeview::DebugInlineeLinesSubsection> Inlinees) {
- DebugInlineeLinesSubsection &Frag = *Inlinees;
-
- // File Checksums have to come first, so push an empty entry on if this
- // is the first.
- if (C13Builders.empty())
- C13Builders.push_back(nullptr);
-
- this->Inlinees.push_back(std::move(Inlinees));
- C13Builders.push_back(
- llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag));
-}
-
-void DbiModuleDescriptorBuilder::setC13FileChecksums(
- std::unique_ptr<DebugChecksumsSubsection> Checksums) {
- assert(!ChecksumInfo && "Can't have more than one checksum info!");
-
- if (C13Builders.empty())
- C13Builders.push_back(nullptr);
-
- ChecksumInfo = std::move(Checksums);
- C13Builders[0] = llvm::make_unique<DebugSubsectionRecordBuilder>(
- ChecksumInfo->kind(), *ChecksumInfo);
+void DbiModuleDescriptorBuilder::addDebugSubsection(
+ std::unique_ptr<DebugSubsection> Subsection) {
+ assert(Subsection);
+ C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>(
+ std::move(Subsection), CodeViewContainer::Pdb));
}
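The new assertions in addSymbol and commit encode the invariant that records destined for a PDB are padded to 4 bytes, while CodeView embedded in object files is unpadded. A hedged sketch of that rule; alignOf and CodeViewContainer mirror the names used in the patch, while alignedLength is a hypothetical helper:

#include <cstdint>

enum class CodeViewContainer { ObjectFile, Pdb };

// Records in object files are unaligned; records in a PDB are 4-byte aligned.
inline uint32_t alignOf(CodeViewContainer C) {
  return C == CodeViewContainer::ObjectFile ? 1 : 4;
}

// Round a record length up to its container's alignment, e.g. 10 -> 12 for a
// PDB, 10 -> 10 for an object file.
inline uint32_t alignedLength(uint32_t Len, CodeViewContainer C) {
  uint32_t A = alignOf(C);
  return (Len + A - 1) & ~(A - 1);
}

With that helper, the assertion in addSymbol amounts to checking that Symbol.length() == alignedLength(Symbol.length(), CodeViewContainer::Pdb).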
diff --git a/lib/DebugInfo/PDB/Native/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp
index 2f4fb6cc295d..320b11dc5cab 100644
--- a/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -252,7 +252,7 @@ Error DbiStream::initializeSectionHeadersData() {
return make_error<RawError>(raw_error_code::no_stream);
auto SHS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
size_t StreamLen = SHS->getLength();
if (StreamLen % sizeof(object::coff_section))
@@ -284,7 +284,7 @@ Error DbiStream::initializeFpoRecords() {
return make_error<RawError>(raw_error_code::no_stream);
auto FS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
size_t StreamLen = FS->getLength();
if (StreamLen % sizeof(object::FpoData))
diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 23c7456d7772..55c20fdb9af6 100644
--- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -357,8 +357,8 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = finalize())
return EC;
- auto DbiS = WritableMappedBlockStream::createIndexedStream(Layout, MsfBuffer,
- StreamDBI);
+ auto DbiS = WritableMappedBlockStream::createIndexedStream(
+ Layout, MsfBuffer, StreamDBI, Allocator);
BinaryStreamWriter Writer(*DbiS);
if (auto EC = Writer.writeObject(*Header))
@@ -396,7 +396,7 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (Stream.StreamNumber == kInvalidStreamIndex)
continue;
auto WritableStream = WritableMappedBlockStream::createIndexedStream(
- Layout, MsfBuffer, Stream.StreamNumber);
+ Layout, MsfBuffer, Stream.StreamNumber, Allocator);
BinaryStreamWriter DbgStreamWriter(*WritableStream);
if (auto EC = DbgStreamWriter.writeArray(Stream.Data))
return EC;
diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index f019d410328a..707128f7efd4 100644
--- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -50,8 +50,8 @@ Error InfoStreamBuilder::finalizeMsfLayout() {
Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
WritableBinaryStreamRef Buffer) const {
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamPDB);
+ auto InfoS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, StreamPDB, Msf.getAllocator());
BinaryStreamWriter Writer(*InfoS);
InfoStreamHeader H;
diff --git a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
index d7a203746a0d..c4ff30011a17 100644
--- a/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
+++ b/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
@@ -55,9 +55,9 @@ Error ModuleDebugStreamRef::reload() {
if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size))
return EC;
- BinaryStreamReader LineReader(C13LinesSubstream);
- if (auto EC =
- LineReader.readArray(LinesAndChecksums, LineReader.bytesRemaining()))
+ BinaryStreamReader SubsectionsReader(C13LinesSubstream);
+ if (auto EC = SubsectionsReader.readArray(Subsections,
+ SubsectionsReader.bytesRemaining()))
return EC;
uint32_t GlobalRefsSize;
@@ -77,13 +77,27 @@ ModuleDebugStreamRef::symbols(bool *HadError) const {
return make_range(SymbolsSubstream.begin(HadError), SymbolsSubstream.end());
}
-llvm::iterator_range<ModuleDebugStreamRef::LinesAndChecksumsIterator>
-ModuleDebugStreamRef::linesAndChecksums() const {
- return make_range(LinesAndChecksums.begin(), LinesAndChecksums.end());
+llvm::iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator>
+ModuleDebugStreamRef::subsections() const {
+ return make_range(Subsections.begin(), Subsections.end());
}
-bool ModuleDebugStreamRef::hasLineInfo() const {
+bool ModuleDebugStreamRef::hasDebugSubsections() const {
return C13LinesSubstream.getLength() > 0;
}
Error ModuleDebugStreamRef::commit() { return Error::success(); }
+
+Expected<codeview::DebugChecksumsSubsectionRef>
+ModuleDebugStreamRef::findChecksumsSubsection() const {
+ for (const auto &SS : subsections()) {
+ if (SS.kind() != DebugSubsectionKind::FileChecksums)
+ continue;
+
+ codeview::DebugChecksumsSubsectionRef Result;
+ if (auto EC = Result.initialize(SS.getRecordData()))
+ return std::move(EC);
+ return Result;
+ }
+ return make_error<RawError>(raw_error_code::no_entry);
+}
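findChecksumsSubsection walks the subsection list in stream order and initializes a ref over the first FileChecksums payload it finds, returning no_entry if the module has none. The same scan, reduced to a standalone sketch with stand-in types:

#include <cstdint>
#include <optional>
#include <vector>

enum class SubsectionKind { Lines, FileChecksums, InlineeLines };

struct Subsection {
  SubsectionKind Kind;
  std::vector<uint8_t> Data; // raw record payload
};

// Return the first FileChecksums payload, or nothing if the module has none.
std::optional<std::vector<uint8_t>>
findChecksums(const std::vector<Subsection> &Subsections) {
  for (const auto &SS : Subsections)
    if (SS.Kind == SubsectionKind::FileChecksums)
      return SS.Data;
  return std::nullopt;
}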
diff --git a/lib/DebugInfo/PDB/Native/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 859295d2c7d3..1254e23c73eb 100644
--- a/lib/DebugInfo/PDB/Native/PDBFile.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -146,7 +146,8 @@ Error PDBFile::parseFileHeaders() {
// at getBlockSize() intervals, so we have to be compatible.
// See the function fpmPn() for more information:
// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L489
- auto FpmStream = MappedBlockStream::createFpmStream(ContainerLayout, *Buffer);
+ auto FpmStream =
+ MappedBlockStream::createFpmStream(ContainerLayout, *Buffer, Allocator);
BinaryStreamReader FpmReader(*FpmStream);
ArrayRef<uint8_t> FpmBytes;
if (auto EC = FpmReader.readBytes(FpmBytes,
@@ -184,7 +185,8 @@ Error PDBFile::parseStreamData() {
// is exactly what we are attempting to parse. By specifying a custom
// subclass of IPDBStreamData which only accesses the fields that have already
// been parsed, we can avoid this and reuse MappedBlockStream.
- auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer);
+ auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer,
+ Allocator);
BinaryStreamReader Reader(*DS);
if (auto EC = Reader.readInteger(NumStreams))
return EC;
@@ -407,5 +409,6 @@ PDBFile::safelyCreateIndexedStream(const MSFLayout &Layout,
uint32_t StreamIndex) const {
if (StreamIndex >= getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
- return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex);
+ return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex,
+ Allocator);
}
diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index c6568029ec55..2c6465e6fb2a 100644
--- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -140,8 +140,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
if (auto EC = Writer.writeArray(Layout.DirectoryBlocks))
return EC;
- auto DirStream =
- WritableMappedBlockStream::createDirectoryStream(Layout, Buffer);
+ auto DirStream = WritableMappedBlockStream::createDirectoryStream(
+ Layout, Buffer, Allocator);
BinaryStreamWriter DW(*DirStream);
if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
return EC;
@@ -158,8 +158,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
if (!ExpectedSN)
return ExpectedSN.takeError();
- auto NS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
- *ExpectedSN);
+ auto NS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, *ExpectedSN, Allocator);
BinaryStreamWriter NSWriter(*NS);
if (auto EC = Strings.commit(NSWriter))
return EC;
diff --git a/lib/DebugInfo/PDB/Native/PDBStringTable.cpp b/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
index e84573fe07b8..6013c342cf02 100644
--- a/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
@@ -56,6 +56,10 @@ Error PDBStringTable::readStrings(BinaryStreamReader &Reader) {
return Error::success();
}
+codeview::DebugStringTableSubsectionRef PDBStringTable::getStringTable() const {
+ return Strings;
+}
+
Error PDBStringTable::readHashTable(BinaryStreamReader &Reader) {
const support::ulittle32_t *HashCount;
if (auto EC = Reader.readObject(HashCount))
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 623afb371b50..67c803d3124e 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -32,8 +32,7 @@ using namespace llvm::support;
using namespace llvm::msf;
using namespace llvm::pdb;
-TpiStream::TpiStream(const PDBFile &File,
- std::unique_ptr<MappedBlockStream> Stream)
+TpiStream::TpiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
: Pdb(File), Stream(std::move(Stream)) {}
TpiStream::~TpiStream() = default;
@@ -77,7 +76,8 @@ Error TpiStream::reload() {
"Invalid TPI hash stream index.");
auto HS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex,
+ Pdb.getAllocator());
BinaryStreamReader HSR(*HS);
// There should be a hash value for every type record, or no hashes at all.
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 20456cc97823..9e943c7f114d 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -147,8 +147,8 @@ Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = finalize())
return EC;
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, Idx);
+ auto InfoS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
+ Idx, Allocator);
BinaryStreamWriter Writer(*InfoS);
if (auto EC = Writer.writeObject(*Header))
@@ -159,8 +159,8 @@ Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
return EC;
if (HashStreamIndex != kInvalidStreamIndex) {
- auto HVS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
- HashStreamIndex);
+ auto HVS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, HashStreamIndex, Allocator);
BinaryStreamWriter HW(*HVS);
if (HashValueStream) {
if (auto EC = HW.writeStreamRef(*HashValueStream))
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 7e6f9a7804b9..7754ac03b43d 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -39,6 +39,21 @@ void DIBuilder::trackIfUnresolved(MDNode *N) {
UnresolvedNodes.emplace_back(N);
}
+void DIBuilder::finalizeSubprogram(DISubprogram *SP) {
+ MDTuple *Temp = SP->getVariables().get();
+ if (!Temp || !Temp->isTemporary())
+ return;
+
+ SmallVector<Metadata *, 4> Variables;
+
+ auto PV = PreservedVariables.find(SP);
+ if (PV != PreservedVariables.end())
+ Variables.append(PV->second.begin(), PV->second.end());
+
+ DINodeArray AV = getOrCreateArray(Variables);
+ TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
+}
+
void DIBuilder::finalize() {
if (!CUNode) {
assert(!AllowUnresolvedNodes &&
@@ -62,25 +77,11 @@ void DIBuilder::finalize() {
CUNode->replaceRetainedTypes(MDTuple::get(VMContext, RetainValues));
DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms);
- auto resolveVariables = [&](DISubprogram *SP) {
- MDTuple *Temp = SP->getVariables().get();
- if (!Temp)
- return;
-
- SmallVector<Metadata *, 4> Variables;
-
- auto PV = PreservedVariables.find(SP);
- if (PV != PreservedVariables.end())
- Variables.append(PV->second.begin(), PV->second.end());
-
- DINodeArray AV = getOrCreateArray(Variables);
- TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
- };
for (auto *SP : SPs)
- resolveVariables(SP);
+ finalizeSubprogram(SP);
for (auto *N : RetainValues)
if (auto *SP = dyn_cast<DISubprogram>(N))
- resolveVariables(SP);
+ finalizeSubprogram(SP);
if (!AllGVs.empty())
CUNode->replaceGlobalVariables(MDTuple::get(VMContext, AllGVs));
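Hoisting the lambda into finalizeSubprogram lets a single subprogram be finalized on demand, and the added isTemporary() check makes repeated calls harmless: the forwarding tuple is only replaced once. A toy sketch of that placeholder-then-replace flow, using stand-in types instead of MDTuple and RAUW:

#include <string>
#include <vector>

struct VariablesNode {
  bool Temporary = true;              // starts as a forwarding placeholder
  std::vector<std::string> Variables;
};

// Safe to call more than once; only the first call performs the replacement,
// mirroring TempMDTuple::replaceAllUsesWith on the temporary node.
void finalizeSubprogram(VariablesNode &N,
                        const std::vector<std::string> &Preserved) {
  if (!N.Temporary)
    return;
  N.Variables = Preserved;
  N.Temporary = false;
}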
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index b7e3f0c6779e..0485fece7c42 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -99,87 +99,6 @@ DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
return Last;
}
-/// Reparent \c Scope from \c OrigSP to \c NewSP.
-static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope,
- DISubprogram *OrigSP, DISubprogram *NewSP,
- DenseMap<const MDNode *, MDNode *> &Cache) {
- SmallVector<DIScope *, 3> ScopeChain;
- DIScope *Last = NewSP;
- DIScope *CurScope = Scope;
- do {
- if (auto *SP = dyn_cast<DISubprogram>(CurScope)) {
- // Don't rewrite this scope chain if it doesn't lead to the replaced SP.
- if (SP != OrigSP)
- return Scope;
- Cache.insert({OrigSP, NewSP});
- break;
- }
- if (auto *Found = Cache[CurScope]) {
- Last = cast<DIScope>(Found);
- break;
- }
- ScopeChain.push_back(CurScope);
- } while ((CurScope = CurScope->getScope().resolve()));
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DIScope *MD : reverse(ScopeChain)) {
- if (auto *LB = dyn_cast<DILexicalBlock>(MD))
- Cache[MD] = Last = DILexicalBlock::getDistinct(
- Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn());
- else if (auto *LB = dyn_cast<DILexicalBlockFile>(MD))
- Cache[MD] = Last = DILexicalBlockFile::getDistinct(
- Ctx, Last, LB->getFile(), LB->getDiscriminator());
- else
- llvm_unreachable("illegal parent scope");
- }
- return Last;
-}
-
-void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
- DISubprogram *NewSP,
- DenseMap<const MDNode *, MDNode *> &Cache) {
- auto DL = I.getDebugLoc();
- if (!OrigSP || !NewSP || OrigSP == NewSP || !DL)
- return;
-
- // Reparent the debug location.
- auto &Ctx = I.getContext();
- DILocation *InlinedAt = DL->getInlinedAt();
- if (InlinedAt) {
- while (auto *IA = InlinedAt->getInlinedAt())
- InlinedAt = IA;
- auto NewScope =
- reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache);
- InlinedAt =
- DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope);
- }
- I.setDebugLoc(
- DebugLoc::get(DL.getLine(), DL.getCol(),
- reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache),
- DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache,
- ReplaceLastInlinedAt)));
-
- // Fix up debug variables to point to NewSP.
- auto reparentVar = [&](DILocalVariable *Var) {
- return DILocalVariable::get(
- Ctx,
- cast<DILocalScope>(
- reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)),
- Var->getName(), Var->getFile(), Var->getLine(), Var->getType(),
- Var->getArg(), Var->getFlags(), Var->getAlignInBits());
- };
- if (auto *DbgValue = dyn_cast<DbgValueInst>(&I)) {
- auto *Var = DbgValue->getVariable();
- I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var)));
- } else if (auto *DbgDeclare = dyn_cast<DbgDeclareInst>(&I)) {
- auto *Var = DbgDeclare->getVariable();
- I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var)));
- }
-}
-
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
if (!Loc)
diff --git a/lib/IR/OptBisect.cpp b/lib/IR/OptBisect.cpp
index b670c817569a..a03a6fb62237 100644
--- a/lib/IR/OptBisect.cpp
+++ b/lib/IR/OptBisect.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OptBisect.h"
#include "llvm/Pass.h"
@@ -53,13 +54,20 @@ static std::string getDescription(const BasicBlock &BB) {
}
static std::string getDescription(const Loop &L) {
- // FIXME: I'd like to be able to provide a better description here, but
- // calling L->getHeader() would introduce a new dependency on the
- // LLVMCore library.
+ // FIXME: Move into LoopInfo so we can get a better description
+ // (and avoid a circular dependency between IR and Analysis).
return "loop";
}
+static std::string getDescription(const Region &R) {
+ // FIXME: Move into RegionInfo so we can get a better description
+ // (and avoid a circular dependency between IR and Analysis).
+ return "region";
+}
+
static std::string getDescription(const CallGraphSCC &SCC) {
+ // FIXME: Move into CallGraphSCCPass to avoid circular dependency between
+ // IR and Analysis.
std::string Desc = "SCC (";
bool First = true;
for (CallGraphNode *CGN : SCC) {
@@ -83,6 +91,7 @@ template bool OptBisect::shouldRunPass(const Pass *, const Function &);
template bool OptBisect::shouldRunPass(const Pass *, const BasicBlock &);
template bool OptBisect::shouldRunPass(const Pass *, const Loop &);
template bool OptBisect::shouldRunPass(const Pass *, const CallGraphSCC &);
+template bool OptBisect::shouldRunPass(const Pass *, const Region &);
template <class UnitT>
bool OptBisect::shouldRunPass(const Pass *P, const UnitT &U) {
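shouldRunPass stays a single template; supporting regions only requires a getDescription(const Region &) overload plus an explicit instantiation, keeping the template body in OptBisect.cpp. A standalone sketch of the pattern (the printed message is illustrative, not the exact OptBisect output):

#include <iostream>
#include <string>

struct Loop {};
struct Region {};

static std::string getDescription(const Loop &) { return "loop"; }
static std::string getDescription(const Region &) { return "region"; }

template <class UnitT> bool shouldRunPass(int Count, const UnitT &U) {
  std::cout << "BISECT: running pass (" << Count << ") on "
            << getDescription(U) << '\n';
  return true;
}

// Explicit instantiations, as in OptBisect.cpp, so the template definition can
// live in one translation unit.
template bool shouldRunPass(int, const Loop &);
template bool shouldRunPass(int, const Region &);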
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 9efc095f9fcf..92145aaf667a 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -122,6 +122,7 @@ static void computeCacheKey(
AddUnsigned(Conf.CGOptLevel);
AddUnsigned(Conf.CGFileType);
AddUnsigned(Conf.OptLevel);
+ AddUnsigned(Conf.UseNewPM);
AddString(Conf.OptPipeline);
AddString(Conf.AAPipeline);
AddString(Conf.OverrideTriple);
@@ -621,6 +622,19 @@ unsigned LTO::getMaxTasks() const {
}
Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
+ // Compute "dead" symbols; we don't want to import/export these!
+ DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
+ for (auto &Res : GlobalResolutions) {
+ if (Res.second.VisibleOutsideThinLTO &&
+ // IRName will be defined if we have seen the prevailing copy of
+ // this value. If not, no need to preserve any ThinLTO copies.
+ !Res.second.IRName.empty())
+ GUIDPreservedSymbols.insert(GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
+ }
+
+ computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
+
// Save the status of having a regularLTO combined module, as
// this is needed for generating the ThinLTO Task ID, and
// the CombinedModule will be moved at the end of runRegularLTO.
@@ -930,6 +944,17 @@ ThinBackend lto::createWriteIndexesThinBackend(std::string OldPrefix,
};
}
+static bool IsLiveByGUID(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID GUID) {
+ auto VI = Index.getValueInfo(GUID);
+ if (!VI)
+ return false;
+ for (auto &I : VI.getSummaryList())
+ if (Index.isGlobalValueLive(I.get()))
+ return true;
+ return false;
+}
+
Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
bool HasRegularLTO) {
if (ThinLTO.ModuleMap.empty())
@@ -962,22 +987,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
if (Conf.OptLevel > 0) {
- // Compute "dead" symbols, we don't want to import/export these!
- DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
- for (auto &Res : GlobalResolutions) {
- if (Res.second.VisibleOutsideThinLTO &&
- // IRName will be defined if we have seen the prevailing copy of
- // this value. If not, no need to preserve any ThinLTO copies.
- !Res.second.IRName.empty())
- GUIDPreservedSymbols.insert(GlobalValue::getGUID(
- GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
- }
-
- auto DeadSymbols =
- computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
-
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
- ImportLists, ExportLists, &DeadSymbols);
+ ImportLists, ExportLists);
std::set<GlobalValue::GUID> ExportedGUIDs;
for (auto &Res : GlobalResolutions) {
@@ -992,7 +1003,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
auto GUID = GlobalValue::getGUID(
GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
- if (!DeadSymbols.count(GUID))
+ if (IsLiveByGUID(ThinLTO.CombinedIndex, GUID))
ExportedGUIDs.insert(GUID);
}
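IsLiveByGUID translates the old set-membership test into a query against the liveness flags that computeDeadSymbols now records in the combined index: a GUID is exported only if at least one of its summaries is live. An equivalent standalone sketch over a toy index:

#include <cstdint>
#include <unordered_map>
#include <vector>

using GUID = uint64_t;
struct GlobalValueSummary { bool Live = false; };

struct ToyIndex {
  std::unordered_map<GUID, std::vector<GlobalValueSummary>> Summaries;
  bool isGlobalValueLive(const GlobalValueSummary &S) const { return S.Live; }
};

static bool IsLiveByGUID(const ToyIndex &Index, GUID G) {
  auto It = Index.Summaries.find(G);
  if (It == Index.Summaries.end())
    return false;                    // no value info: nothing to export
  for (const auto &S : It->second)
    if (Index.isGlobalValueLive(S))
      return true;                   // one live copy suffices
  return false;
}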
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index f9c41f5c9744..3f72e446cdf2 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -42,11 +42,6 @@
using namespace llvm;
using namespace lto;
-static cl::opt<bool>
- LTOUseNewPM("lto-use-new-pm",
- cl::desc("Run LTO passes using the new pass manager"),
- cl::init(false), cl::Hidden);
-
LLVM_ATTRIBUTE_NORETURN static void reportOpenError(StringRef Path, Twine Msg) {
errs() << "failed to open " << Path << ": " << Msg << '\n';
errs().flush();
@@ -266,7 +261,7 @@ bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
if (!Conf.OptPipeline.empty())
runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
Conf.DisableVerify);
- else if (LTOUseNewPM)
+ else if (Conf.UseNewPM)
runNewPMPasses(Mod, TM, Conf.OptLevel, IsThinLTO);
else
runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary);
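With the hidden -lto-use-new-pm flag removed, pass-manager selection becomes a per-link decision carried by lto::Config, and it participates in the cache key (see computeCacheKey above). A hedged client-side fragment, using only the fields this patch touches:

lto::Config Conf;
Conf.OptLevel = 2;     // hashed into the cache key
Conf.UseNewPM = true;  // replaces the old cl::opt; also hashed
// Hand Conf to lto::LTO as usual; opt() routes through runNewPMPasses()
// when UseNewPM is set and no custom OptPipeline is given.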
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index ca3fc60f9501..6b221a347c17 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -628,13 +628,13 @@ void ThinLTOCodeGenerator::promote(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
// Resolve LinkOnce/Weak symbols.
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
@@ -673,13 +673,13 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
auto &ImportList = ImportLists[TheModule.getModuleIdentifier()];
crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
@@ -750,13 +750,13 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
auto &ExportList = ExportLists[ModuleIdentifier];
// Be friendly and don't nuke totally the module when the client didn't
@@ -902,14 +902,14 @@ void ThinLTOCodeGenerator::run() {
computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple);
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols);
+ computeDeadSymbols(*Index, GUIDPreservedSymbols);
// Collect the import/export lists for all modules from the call-graph in the
// combined index.
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
// We use a std::map here to be able to have a defined ordering when
// producing a hash for the cache entry.
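All four ThinLTO entry points now follow the same shape: computeDeadSymbols marks liveness inside the index itself, so ComputeCrossModuleImport no longer takes a dead-symbol set. The before/after call pattern, condensed:

// Before this patch:
//   auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
//   ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries,
//                            ImportLists, ExportLists, &DeadSymbols);
// After:
computeDeadSymbols(Index, GUIDPreservedSymbols);    // sets Live flags in Index
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries,
                         ImportLists, ExportLists); // consults Index liveness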
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index 8c3df36cfb48..9b2031f05043 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -40,6 +40,7 @@ using namespace llvm;
#define DEBUG_TYPE "reloc-info"
namespace {
+
// For patching purposes, we need to remember where each section starts, both
// for patching up the section size field, and for patching up references to
// locations within the section.
@@ -50,6 +51,82 @@ struct SectionBookkeeping {
uint64_t ContentsOffset;
};
+// The signature of a wasm function, in a struct capable of being used as a
+// DenseMap key.
+struct WasmFunctionType {
+ // Support empty and tombstone instances, needed by DenseMap.
+ enum { Plain, Empty, Tombstone } State;
+
+ // The return types of the function.
+ SmallVector<wasm::ValType, 1> Returns;
+
+ // The parameter types of the function.
+ SmallVector<wasm::ValType, 4> Params;
+
+ WasmFunctionType() : State(Plain) {}
+
+ bool operator==(const WasmFunctionType &Other) const {
+ return State == Other.State && Returns == Other.Returns &&
+ Params == Other.Params;
+ }
+};
+
+// Traits for using WasmFunctionType in a DenseMap.
+struct WasmFunctionTypeDenseMapInfo {
+ static WasmFunctionType getEmptyKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Empty;
+ return FuncTy;
+ }
+ static WasmFunctionType getTombstoneKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Tombstone;
+ return FuncTy;
+ }
+ static unsigned getHashValue(const WasmFunctionType &FuncTy) {
+ uintptr_t Value = FuncTy.State;
+ for (wasm::ValType Ret : FuncTy.Returns)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+ for (wasm::ValType Param : FuncTy.Params)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+ return Value;
+ }
+ static bool isEqual(const WasmFunctionType &LHS,
+ const WasmFunctionType &RHS) {
+ return LHS == RHS;
+ }
+};
+
+// A wasm import to be written into the import section.
+struct WasmImport {
+ StringRef ModuleName;
+ StringRef FieldName;
+ unsigned Kind;
+ int32_t Type;
+};
+
+// A wasm function to be written into the function section.
+struct WasmFunction {
+ int32_t Type;
+ const MCSymbolWasm *Sym;
+};
+
+// A wasm export to be written into the export section.
+struct WasmExport {
+ StringRef FieldName;
+ unsigned Kind;
+ uint32_t Index;
+};
+
+// A wasm global to be written into the global section.
+struct WasmGlobal {
+ wasm::ValType Type;
+ bool IsMutable;
+ bool HasImport;
+ uint64_t InitialValue;
+ uint32_t ImportIndex;
+};
+
class WasmObjectWriter : public MCObjectWriter {
/// Helper struct for containing some precomputed information on symbols.
struct WasmSymbolData {
@@ -91,18 +168,10 @@ public:
: MCObjectWriter(OS, /*IsLittleEndian=*/true), TargetObjectWriter(MOTW) {}
private:
- void reset() override {
- MCObjectWriter::reset();
- }
-
~WasmObjectWriter() override;
void writeHeader(const MCAssembler &Asm);
- void writeValueType(wasm::ValType Ty) {
- encodeSLEB128(int32_t(Ty), getStream());
- }
-
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
MCValue Target, bool &IsPCRel,
@@ -112,7 +181,37 @@ private:
const MCAsmLayout &Layout) override;
void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+
+ void writeValueType(wasm::ValType Ty) {
+ encodeSLEB128(int32_t(Ty), getStream());
+ }
+
+ void writeTypeSection(const SmallVector<WasmFunctionType, 4> &FunctionTypes);
+ void writeImportSection(const SmallVector<WasmImport, 4> &Imports);
+ void writeFunctionSection(const SmallVector<WasmFunction, 4> &Functions);
+ void writeTableSection(const SmallVector<uint32_t, 4> &TableElems);
+ void writeMemorySection(const SmallVector<char, 0> &DataBytes);
+ void writeGlobalSection(const SmallVector<WasmGlobal, 4> &Globals);
+ void writeExportSection(const SmallVector<WasmExport, 4> &Exports);
+ void writeElemSection(const SmallVector<uint32_t, 4> &TableElems);
+ void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+ const SmallVector<WasmFunction, 4> &Functions);
+ uint64_t
+ writeDataSection(const SmallVector<char, 0> &DataBytes,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices);
+ void writeNameSection(const SmallVector<WasmFunction, 4> &Functions,
+ const SmallVector<WasmImport, 4> &Imports,
+ uint32_t NumFuncImports);
+ void writeCodeRelocSection(
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices);
+ void writeDataRelocSection(
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+ uint64_t DataSectionHeaderSize);
+ void writeLinkingMetaDataSection(bool HasStackPointer,
+ uint32_t StackPointerGlobal);
};
+
} // end anonymous namespace
WasmObjectWriter::~WasmObjectWriter() {}
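Moving WasmFunctionType and its traits above the class lets the new write* methods name them. The traits follow LLVM's DenseMapInfo contract: reserve two sentinel keys that can never equal a real value, and keep hashing consistent with equality. A standalone analogue; note the additive hash is order-insensitive, exactly as in the patch, which only costs extra collisions since correctness rests on isEqual:

#include <cstdint>
#include <vector>

struct Key {
  enum State { Plain, Empty, Tombstone } St = Plain;
  std::vector<int32_t> Params;
  bool operator==(const Key &O) const {
    return St == O.St && Params == O.Params;
  }
};

struct KeyInfo {
  static Key getEmptyKey()     { Key K; K.St = Key::Empty; return K; }
  static Key getTombstoneKey() { Key K; K.St = Key::Tombstone; return K; }
  static unsigned getHashValue(const Key &K) {
    unsigned V = K.St;               // sentinels hash apart from Plain keys
    for (int32_t P : K.Params)
      V += static_cast<unsigned>(P); // summed: permutations collide, harmlessly
    return V;
  }
  static bool isEqual(const Key &L, const Key &R) { return L == R; }
};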
@@ -278,86 +377,6 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
DataRelocations.push_back(Rec);
}
-namespace {
-
-// The signature of a wasm function, in a struct capable of being used as a
-// DenseMap key.
-struct WasmFunctionType {
- // Support empty and tombstone instances, needed by DenseMap.
- enum { Plain, Empty, Tombstone } State;
-
- // The return types of the function.
- SmallVector<wasm::ValType, 1> Returns;
-
- // The parameter types of the function.
- SmallVector<wasm::ValType, 4> Params;
-
- WasmFunctionType() : State(Plain) {}
-
- bool operator==(const WasmFunctionType &Other) const {
- return State == Other.State && Returns == Other.Returns &&
- Params == Other.Params;
- }
-};
-
-// Traits for using WasmFunctionType in a DenseMap.
-struct WasmFunctionTypeDenseMapInfo {
- static WasmFunctionType getEmptyKey() {
- WasmFunctionType FuncTy;
- FuncTy.State = WasmFunctionType::Empty;
- return FuncTy;
- }
- static WasmFunctionType getTombstoneKey() {
- WasmFunctionType FuncTy;
- FuncTy.State = WasmFunctionType::Tombstone;
- return FuncTy;
- }
- static unsigned getHashValue(const WasmFunctionType &FuncTy) {
- uintptr_t Value = FuncTy.State;
- for (wasm::ValType Ret : FuncTy.Returns)
- Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
- for (wasm::ValType Param : FuncTy.Params)
- Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
- return Value;
- }
- static bool isEqual(const WasmFunctionType &LHS,
- const WasmFunctionType &RHS) {
- return LHS == RHS;
- }
-};
-
-// A wasm import to be written into the import section.
-struct WasmImport {
- StringRef ModuleName;
- StringRef FieldName;
- unsigned Kind;
- int32_t Type;
-};
-
-// A wasm function to be written into the function section.
-struct WasmFunction {
- int32_t Type;
- const MCSymbolWasm *Sym;
-};
-
-// A wasm export to be written into the export section.
-struct WasmExport {
- StringRef FieldName;
- unsigned Kind;
- uint32_t Index;
-};
-
-// A wasm global to be written into the global section.
-struct WasmGlobal {
- wasm::ValType Type;
- bool IsMutable;
- bool HasImport;
- uint64_t InitialValue;
- uint32_t ImportIndex;
-};
-
-} // end anonymous namespace
-
// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
// to allow patching.
static void
@@ -529,6 +548,367 @@ static void WriteTypeRelocations(
}
}
+void WasmObjectWriter::writeTypeSection(
+ const SmallVector<WasmFunctionType, 4> &FunctionTypes) {
+ if (FunctionTypes.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_TYPE);
+
+ encodeULEB128(FunctionTypes.size(), getStream());
+
+ for (const WasmFunctionType &FuncTy : FunctionTypes) {
+ encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
+ encodeULEB128(FuncTy.Params.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Params)
+ writeValueType(Ty);
+ encodeULEB128(FuncTy.Returns.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Returns)
+ writeValueType(Ty);
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeImportSection(
+ const SmallVector<WasmImport, 4> &Imports) {
+ if (Imports.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_IMPORT);
+
+ encodeULEB128(Imports.size(), getStream());
+ for (const WasmImport &Import : Imports) {
+ StringRef ModuleName = Import.ModuleName;
+ encodeULEB128(ModuleName.size(), getStream());
+ writeBytes(ModuleName);
+
+ StringRef FieldName = Import.FieldName;
+ encodeULEB128(FieldName.size(), getStream());
+ writeBytes(FieldName);
+
+ encodeULEB128(Import.Kind, getStream());
+
+ switch (Import.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ encodeULEB128(Import.Type, getStream());
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ encodeSLEB128(int32_t(Import.Type), getStream());
+ encodeULEB128(0, getStream()); // mutability
+ break;
+ default:
+ llvm_unreachable("unsupported import kind");
+ }
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeFunctionSection(
+ const SmallVector<WasmFunction, 4> &Functions) {
+ if (Functions.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_FUNCTION);
+
+ encodeULEB128(Functions.size(), getStream());
+ for (const WasmFunction &Func : Functions)
+ encodeULEB128(Func.Type, getStream());
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeTableSection(
+ const SmallVector<uint32_t, 4> &TableElems) {
+ // For now, always emit the table section, since indirect calls are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no indirect calls.
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_TABLE);
+
+ // The number of tables, fixed to 1 for now.
+ encodeULEB128(1, getStream());
+
+ encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream());
+
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(TableElems.size(), getStream()); // initial
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeMemorySection(
+ const SmallVector<char, 0> &DataBytes) {
+ // For now, always emit the memory section, since loads and stores are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no loads or stores.
+ SectionBookkeeping Section;
+ uint32_t NumPages =
+ (DataBytes.size() + wasm::WasmPageSize - 1) / wasm::WasmPageSize;
+
+ startSection(Section, wasm::WASM_SEC_MEMORY);
+ encodeULEB128(1, getStream()); // number of memory spaces
+
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(NumPages, getStream()); // initial
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeGlobalSection(
+ const SmallVector<WasmGlobal, 4> &Globals) {
+ if (Globals.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_GLOBAL);
+
+ encodeULEB128(Globals.size(), getStream());
+ for (const WasmGlobal &Global : Globals) {
+ writeValueType(Global.Type);
+ write8(Global.IsMutable);
+
+ if (Global.HasImport) {
+ assert(Global.InitialValue == 0);
+ write8(wasm::WASM_OPCODE_GET_GLOBAL);
+ encodeULEB128(Global.ImportIndex, getStream());
+ } else {
+ assert(Global.ImportIndex == 0);
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(Global.InitialValue, getStream()); // offset
+ }
+ write8(wasm::WASM_OPCODE_END);
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeExportSection(
+ const SmallVector<WasmExport, 4> &Exports) {
+ if (Exports.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_EXPORT);
+
+ encodeULEB128(Exports.size(), getStream());
+ for (const WasmExport &Export : Exports) {
+ encodeULEB128(Export.FieldName.size(), getStream());
+ writeBytes(Export.FieldName);
+
+ encodeSLEB128(Export.Kind, getStream());
+
+ encodeULEB128(Export.Index, getStream());
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeElemSection(
+ const SmallVector<uint32_t, 4> &TableElems) {
+ if (TableElems.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_ELEM);
+
+ encodeULEB128(1, getStream()); // number of "segments"
+ encodeULEB128(0, getStream()); // the table index
+
+ // init expr for starting offset
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream());
+ write8(wasm::WASM_OPCODE_END);
+
+ encodeULEB128(TableElems.size(), getStream());
+ for (uint32_t Elem : TableElems)
+ encodeULEB128(Elem, getStream());
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeCodeSection(
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+ const SmallVector<WasmFunction, 4> &Functions) {
+ if (Functions.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CODE);
+
+ encodeULEB128(Functions.size(), getStream());
+
+ for (const WasmFunction &Func : Functions) {
+ MCSectionWasm &FuncSection =
+ static_cast<MCSectionWasm &>(Func.Sym->getSection());
+
+ if (Func.Sym->isVariable())
+ report_fatal_error("weak symbols not supported yet");
+
+ if (Func.Sym->getOffset() != 0)
+ report_fatal_error("function sections must contain one function each");
+
+ if (!Func.Sym->getSize())
+ report_fatal_error("function symbols must have a size set with .size");
+
+ int64_t Size = 0;
+ if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
+ encodeULEB128(Size, getStream());
+
+ FuncSection.setSectionOffset(getStream().tell() -
+ Section.ContentsOffset);
+
+ Asm.writeSectionData(&FuncSection, Layout);
+ }
+
+ // Apply the type index fixups for call_indirect etc. instructions.
+ for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
+ uint32_t Type = TypeIndexFixupTypes[i];
+ unsigned Padding = PaddingFor5ByteULEB128(Type);
+
+ const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
+ assert(Fixup.Addend == 0);
+ assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+ uint64_t Offset = Fixup.Offset +
+ Fixup.FixupSection->getSectionOffset();
+
+ uint8_t Buffer[16];
+ unsigned SizeLen = encodeULEB128(Type, Buffer, Padding);
+ assert(SizeLen == 5);
+ getStream().pwrite((char *)Buffer, SizeLen,
+ Section.ContentsOffset + Offset);
+ }
+
+ // Apply fixups.
+ ApplyRelocations(CodeRelocations, getStream(), SymbolIndices,
+ Section.ContentsOffset);
+
+ endSection(Section);
+}
+
+uint64_t WasmObjectWriter::writeDataSection(
+ const SmallVector<char, 0> &DataBytes,
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices) {
+ if (DataBytes.empty())
+ return 0;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_DATA);
+
+ encodeULEB128(1, getStream()); // count
+ encodeULEB128(0, getStream()); // memory index
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream()); // offset
+ write8(wasm::WASM_OPCODE_END);
+ encodeULEB128(DataBytes.size(), getStream()); // size
+ uint32_t HeaderSize = getStream().tell() - Section.ContentsOffset;
+ writeBytes(DataBytes); // data
+
+ // Apply fixups.
+ ApplyRelocations(DataRelocations, getStream(), SymbolIndices,
+ Section.ContentsOffset + HeaderSize);
+
+ endSection(Section);
+ return HeaderSize;
+}
+
+void WasmObjectWriter::writeNameSection(
+ const SmallVector<WasmFunction, 4> &Functions,
+ const SmallVector<WasmImport, 4> &Imports,
+ uint32_t NumFuncImports) {
+ uint32_t TotalFunctions = NumFuncImports + Functions.size();
+ if (TotalFunctions == 0)
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
+ SectionBookkeeping SubSection;
+ startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
+
+ encodeULEB128(TotalFunctions, getStream());
+ uint32_t Index = 0;
+ for (const WasmImport &Import : Imports) {
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ encodeULEB128(Index, getStream());
+ encodeULEB128(Import.FieldName.size(), getStream());
+ writeBytes(Import.FieldName);
+ ++Index;
+ }
+ }
+ for (const WasmFunction &Func : Functions) {
+ encodeULEB128(Index, getStream());
+ encodeULEB128(Func.Sym->getName().size(), getStream());
+ writeBytes(Func.Sym->getName());
+ ++Index;
+ }
+
+ endSection(SubSection);
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeCodeRelocSection(
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices) {
+ // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // for descriptions of the reloc sections.
+
+ if (CodeRelocations.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
+
+ encodeULEB128(wasm::WASM_SEC_CODE, getStream());
+ encodeULEB128(CodeRelocations.size() + TypeIndexFixups.size(), getStream());
+
+ WriteRelocations(CodeRelocations, getStream(), SymbolIndices, 0);
+ WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream());
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeDataRelocSection(
+ DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+ uint64_t DataSectionHeaderSize) {
+ // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // for descriptions of the reloc sections.
+
+ if (DataRelocations.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
+
+ encodeULEB128(wasm::WASM_SEC_DATA, getStream());
+ encodeULEB128(DataRelocations.size(), getStream());
+
+ WriteRelocations(DataRelocations, getStream(), SymbolIndices,
+ DataSectionHeaderSize);
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeLinkingMetaDataSection(
+ bool HasStackPointer, uint32_t StackPointerGlobal) {
+ if (!HasStackPointer)
+ return;
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
+
+ encodeULEB128(1, getStream()); // count
+
+ encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type
+ encodeULEB128(StackPointerGlobal, getStream()); // id
+
+ endSection(Section);
+}
+
void WasmObjectWriter::writeObject(MCAssembler &Asm,
const MCAsmLayout &Layout) {
MCContext &Ctx = Asm.getContext();
@@ -730,16 +1110,21 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
if (IsAddressTaken.count(&WS))
TableElems.push_back(Index);
} else {
- if (WS.getOffset() != 0)
- report_fatal_error("data sections must contain one variable each");
- if (!WS.getSize())
- report_fatal_error("data symbols must have a size set with .size");
-
- int64_t Size = 0;
- if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
- report_fatal_error(".size expression must be evaluatable");
+ if (WS.isTemporary() && !WS.getSize())
+ continue;
if (WS.isDefined(false)) {
+ if (WS.getOffset() != 0)
+ report_fatal_error("data sections must contain one variable each: " +
+ WS.getName());
+ if (!WS.getSize())
+ report_fatal_error("data symbols must have a size set with .size: " +
+ WS.getName());
+
+ int64_t Size = 0;
+ if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
MCSectionWasm &DataSection =
static_cast<MCSectionWasm &>(WS.getSection());
@@ -827,322 +1212,23 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm,
// Write out the Wasm header.
writeHeader(Asm);
- SectionBookkeeping Section;
-
- // === Type Section =========================================================
- if (!FunctionTypes.empty()) {
- startSection(Section, wasm::WASM_SEC_TYPE);
-
- encodeULEB128(FunctionTypes.size(), getStream());
-
- for (WasmFunctionType &FuncTy : FunctionTypes) {
- encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
- encodeULEB128(FuncTy.Params.size(), getStream());
- for (wasm::ValType Ty : FuncTy.Params)
- writeValueType(Ty);
- encodeULEB128(FuncTy.Returns.size(), getStream());
- for (wasm::ValType Ty : FuncTy.Returns)
- writeValueType(Ty);
- }
-
- endSection(Section);
- }
-
- // === Import Section ========================================================
- if (!Imports.empty()) {
- startSection(Section, wasm::WASM_SEC_IMPORT);
-
- encodeULEB128(Imports.size(), getStream());
- for (const WasmImport &Import : Imports) {
- StringRef ModuleName = Import.ModuleName;
- encodeULEB128(ModuleName.size(), getStream());
- writeBytes(ModuleName);
-
- StringRef FieldName = Import.FieldName;
- encodeULEB128(FieldName.size(), getStream());
- writeBytes(FieldName);
-
- encodeULEB128(Import.Kind, getStream());
-
- switch (Import.Kind) {
- case wasm::WASM_EXTERNAL_FUNCTION:
- encodeULEB128(Import.Type, getStream());
- break;
- case wasm::WASM_EXTERNAL_GLOBAL:
- encodeSLEB128(int32_t(Import.Type), getStream());
- encodeULEB128(0, getStream()); // mutability
- break;
- default:
- llvm_unreachable("unsupported import kind");
- }
- }
-
- endSection(Section);
- }
-
- // === Function Section ======================================================
- if (!Functions.empty()) {
- startSection(Section, wasm::WASM_SEC_FUNCTION);
-
- encodeULEB128(Functions.size(), getStream());
- for (const WasmFunction &Func : Functions)
- encodeULEB128(Func.Type, getStream());
-
- endSection(Section);
- }
-
- // === Table Section =========================================================
- // For now, always emit the table section, since indirect calls are not
- // valid without it. In the future, we could perhaps be more clever and omit
- // it if there are no indirect calls.
- startSection(Section, wasm::WASM_SEC_TABLE);
-
- // The number of tables, fixed to 1 for now.
- encodeULEB128(1, getStream());
-
- encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream());
-
- encodeULEB128(0, getStream()); // flags
- encodeULEB128(TableElems.size(), getStream()); // initial
-
- endSection(Section);
-
- // === Memory Section ========================================================
- // For now, always emit the memory section, since loads and stores are not
- // valid without it. In the future, we could perhaps be more clever and omit
- // it if there are no loads or stores.
- uint32_t NumPages =
- (DataBytes.size() + wasm::WasmPageSize - 1) / wasm::WasmPageSize;
-
- startSection(Section, wasm::WASM_SEC_MEMORY);
- encodeULEB128(1, getStream()); // number of memory spaces
-
- encodeULEB128(0, getStream()); // flags
- encodeULEB128(NumPages, getStream()); // initial
-
- endSection(Section);
-
- // === Global Section ========================================================
- if (!Globals.empty()) {
- startSection(Section, wasm::WASM_SEC_GLOBAL);
-
- encodeULEB128(Globals.size(), getStream());
- for (const WasmGlobal &Global : Globals) {
- writeValueType(Global.Type);
- write8(Global.IsMutable);
-
- if (Global.HasImport) {
- assert(Global.InitialValue == 0);
- write8(wasm::WASM_OPCODE_GET_GLOBAL);
- encodeULEB128(Global.ImportIndex, getStream());
- } else {
- assert(Global.ImportIndex == 0);
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(Global.InitialValue, getStream()); // offset
- }
- write8(wasm::WASM_OPCODE_END);
- }
-
- endSection(Section);
- }
-
- // === Export Section ========================================================
- if (!Exports.empty()) {
- startSection(Section, wasm::WASM_SEC_EXPORT);
-
- encodeULEB128(Exports.size(), getStream());
- for (const WasmExport &Export : Exports) {
- encodeULEB128(Export.FieldName.size(), getStream());
- writeBytes(Export.FieldName);
-
- encodeSLEB128(Export.Kind, getStream());
-
- encodeULEB128(Export.Index, getStream());
- }
-
- endSection(Section);
- }
-
-#if 0 // TODO: Start Section
- if (HaveStartFunction) {
- // === Start Section =========================================================
- startSection(Section, wasm::WASM_SEC_START);
-
- encodeSLEB128(StartFunction, getStream());
-
- endSection(Section);
- }
-#endif
-
- // === Elem Section ==========================================================
- if (!TableElems.empty()) {
- startSection(Section, wasm::WASM_SEC_ELEM);
-
- encodeULEB128(1, getStream()); // number of "segments"
- encodeULEB128(0, getStream()); // the table index
-
- // init expr for starting offset
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(0, getStream());
- write8(wasm::WASM_OPCODE_END);
-
- encodeULEB128(TableElems.size(), getStream());
- for (uint32_t Elem : TableElems)
- encodeULEB128(Elem, getStream());
-
- endSection(Section);
- }
-
- // === Code Section ==========================================================
- if (!Functions.empty()) {
- startSection(Section, wasm::WASM_SEC_CODE);
-
- encodeULEB128(Functions.size(), getStream());
-
- for (const WasmFunction &Func : Functions) {
- MCSectionWasm &FuncSection =
- static_cast<MCSectionWasm &>(Func.Sym->getSection());
-
- if (Func.Sym->isVariable())
- report_fatal_error("weak symbols not supported yet");
-
- if (Func.Sym->getOffset() != 0)
- report_fatal_error("function sections must contain one function each");
-
- if (!Func.Sym->getSize())
- report_fatal_error("function symbols must have a size set with .size");
-
- int64_t Size = 0;
- if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
- report_fatal_error(".size expression must be evaluatable");
-
- encodeULEB128(Size, getStream());
-
- FuncSection.setSectionOffset(getStream().tell() -
- Section.ContentsOffset);
-
- Asm.writeSectionData(&FuncSection, Layout);
- }
-
- // Apply the type index fixups for call_indirect etc. instructions.
- for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
- uint32_t Type = TypeIndexFixupTypes[i];
- unsigned Padding = PaddingFor5ByteULEB128(Type);
-
- const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
- assert(Fixup.Addend == 0);
- assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
- uint64_t Offset = Fixup.Offset +
- Fixup.FixupSection->getSectionOffset();
-
- uint8_t Buffer[16];
- unsigned SizeLen = encodeULEB128(Type, Buffer, Padding);
- assert(SizeLen == 5);
- getStream().pwrite((char *)Buffer, SizeLen,
- Section.ContentsOffset + Offset);
- }
-
- // Apply fixups.
- ApplyRelocations(CodeRelocations, getStream(), SymbolIndices,
- Section.ContentsOffset);
-
- endSection(Section);
- }
-
- // === Data Section ==========================================================
- uint32_t DataSectionHeaderSize = 0;
- if (!DataBytes.empty()) {
- startSection(Section, wasm::WASM_SEC_DATA);
-
- encodeULEB128(1, getStream()); // count
- encodeULEB128(0, getStream()); // memory index
- write8(wasm::WASM_OPCODE_I32_CONST);
- encodeSLEB128(0, getStream()); // offset
- write8(wasm::WASM_OPCODE_END);
- encodeULEB128(DataBytes.size(), getStream()); // size
- DataSectionHeaderSize = getStream().tell() - Section.ContentsOffset;
- writeBytes(DataBytes); // data
-
- // Apply fixups.
- ApplyRelocations(DataRelocations, getStream(), SymbolIndices,
- Section.ContentsOffset + DataSectionHeaderSize);
-
- endSection(Section);
- }
-
- // === Name Section ==========================================================
- uint32_t TotalFunctions = NumFuncImports + Functions.size();
- if (TotalFunctions != 0) {
- startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
- SectionBookkeeping SubSection;
- startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
-
- encodeULEB128(TotalFunctions, getStream());
- uint32_t Index = 0;
- for (const WasmImport &Import : Imports) {
- if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
- encodeULEB128(Index, getStream());
- encodeULEB128(Import.FieldName.size(), getStream());
- writeBytes(Import.FieldName);
- ++Index;
- }
- }
- for (const WasmFunction &Func : Functions) {
- encodeULEB128(Index, getStream());
- encodeULEB128(Func.Sym->getName().size(), getStream());
- writeBytes(Func.Sym->getName());
- ++Index;
- }
-
- endSection(SubSection);
- endSection(Section);
- }
-
- // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
- // for descriptions of the reloc sections.
-
- // === Code Reloc Section ====================================================
- if (!CodeRelocations.empty()) {
- startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
-
- encodeULEB128(wasm::WASM_SEC_CODE, getStream());
-
- encodeULEB128(CodeRelocations.size() + TypeIndexFixups.size(), getStream());
-
- WriteRelocations(CodeRelocations, getStream(), SymbolIndices, 0);
- WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream());
-
- endSection(Section);
- }
-
- // === Data Reloc Section ====================================================
- if (!DataRelocations.empty()) {
- startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
-
- encodeULEB128(wasm::WASM_SEC_DATA, getStream());
-
- encodeULEB128(DataRelocations.size(), getStream());
-
- WriteRelocations(DataRelocations, getStream(), SymbolIndices,
- DataSectionHeaderSize);
-
- endSection(Section);
- }
-
- // === Linking Metadata Section ==============================================
- if (HasStackPointer) {
- startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
-
- encodeULEB128(1, getStream()); // count
-
- encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type
- encodeULEB128(StackPointerGlobal, getStream()); // id
-
- endSection(Section);
- }
+ writeTypeSection(FunctionTypes);
+ writeImportSection(Imports);
+ writeFunctionSection(Functions);
+ writeTableSection(TableElems);
+ writeMemorySection(DataBytes);
+ writeGlobalSection(Globals);
+ writeExportSection(Exports);
+ // TODO: Start Section
+ writeElemSection(TableElems);
+ writeCodeSection(Asm, Layout, SymbolIndices, Functions);
+ uint64_t DataSectionHeaderSize = writeDataSection(DataBytes, SymbolIndices);
+ writeNameSection(Functions, Imports, NumFuncImports);
+ writeCodeRelocSection(SymbolIndices);
+ writeDataRelocSection(SymbolIndices, DataSectionHeaderSize);
+ writeLinkingMetaDataSection(HasStackPointer, StackPointerGlobal);
// TODO: Translate the .comment section to the output.
-
// TODO: Translate debug sections to the output.
}
diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
index f652ff57f30d..21d29835624e 100644
--- a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
+++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
@@ -17,6 +17,11 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
#include "llvm/DebugInfo/CodeView/EnumTables.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
@@ -36,16 +41,80 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo)
LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef)
LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false)
+LLVM_YAML_DECLARE_ENUM_TRAITS(DebugSubsectionKind)
LLVM_YAML_DECLARE_ENUM_TRAITS(FileChecksumKind)
LLVM_YAML_DECLARE_BITSET_TRAITS(LineFlags)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineEntry)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceColumnEntry)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileChecksumEntry)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineInfo)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineBlock)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeInfo)
-LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeSite)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceColumnEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceFileChecksumEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineBlock)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(InlineeSite)
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
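+// Each subsection kind gets a polymorphic YAML wrapper that knows how to
+// map itself to and from YAML and how to convert itself back into the
+// corresponding binary DebugSubsection.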
+struct YAMLSubsectionBase {
+ explicit YAMLSubsectionBase(DebugSubsectionKind Kind) : Kind(Kind) {}
+ DebugSubsectionKind Kind;
+ virtual ~YAMLSubsectionBase() {}
+
+ virtual void map(IO &IO) = 0;
+ virtual std::unique_ptr<DebugSubsection>
+ toCodeViewSubsection(DebugStringTableSubsection *UseStrings,
+ DebugChecksumsSubsection *UseChecksums) const = 0;
+};
+}
+}
+}
+
+namespace {
+struct YAMLChecksumsSubsection : public YAMLSubsectionBase {
+ YAMLChecksumsSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::FileChecksums) {}
+
+ void map(IO &IO) override;
+ std::unique_ptr<DebugSubsection>
+ toCodeViewSubsection(DebugStringTableSubsection *Strings,
+ DebugChecksumsSubsection *Checksums) const override;
+ static Expected<std::shared_ptr<YAMLChecksumsSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &FC);
+
+ std::vector<SourceFileChecksumEntry> Checksums;
+};
+
+struct YAMLLinesSubsection : public YAMLSubsectionBase {
+ YAMLLinesSubsection() : YAMLSubsectionBase(DebugSubsectionKind::Lines) {}
+
+ void map(IO &IO) override;
+ std::unique_ptr<DebugSubsection>
+ toCodeViewSubsection(DebugStringTableSubsection *Strings,
+ DebugChecksumsSubsection *Checksums) const override;
+ static Expected<std::shared_ptr<YAMLLinesSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugLinesSubsectionRef &Lines);
+
+ SourceLineInfo Lines;
+};
+
+struct YAMLInlineeLinesSubsection : public YAMLSubsectionBase {
+ YAMLInlineeLinesSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::InlineeLines) {}
+
+ void map(IO &IO) override;
+ std::unique_ptr<DebugSubsection>
+ toCodeViewSubsection(DebugStringTableSubsection *Strings,
+ DebugChecksumsSubsection *Checksums) const override;
+ static Expected<std::shared_ptr<YAMLInlineeLinesSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugInlineeLinesSubsectionRef &Lines);
+
+ InlineeInfo InlineeLines;
+};
+}
void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) {
io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns);
@@ -99,21 +168,6 @@ void MappingTraits<SourceFileChecksumEntry>::mapping(
IO.mapRequired("Checksum", Obj.ChecksumBytes);
}
-void MappingTraits<SourceLineInfo>::mapping(IO &IO, SourceLineInfo &Obj) {
- IO.mapRequired("CodeSize", Obj.CodeSize);
-
- IO.mapRequired("Flags", Obj.Flags);
- IO.mapRequired("RelocOffset", Obj.RelocOffset);
- IO.mapRequired("RelocSegment", Obj.RelocSegment);
- IO.mapRequired("Blocks", Obj.Blocks);
-}
-
-void MappingTraits<SourceFileInfo>::mapping(IO &IO, SourceFileInfo &Obj) {
- IO.mapOptional("Checksums", Obj.FileChecksums);
- IO.mapOptional("Lines", Obj.LineFragments);
- IO.mapOptional("InlineeLines", Obj.Inlinees);
-}
-
void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) {
IO.mapRequired("FileName", Obj.FileName);
IO.mapRequired("LineNum", Obj.SourceLineNum);
@@ -121,7 +175,310 @@ void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) {
IO.mapOptional("ExtraFiles", Obj.ExtraFiles);
}
-void MappingTraits<InlineeInfo>::mapping(IO &IO, InlineeInfo &Obj) {
- IO.mapRequired("HasExtraFiles", Obj.HasExtraFiles);
- IO.mapRequired("Sites", Obj.Sites);
+void YAMLChecksumsSubsection::map(IO &IO) {
+ IO.mapTag("!FileChecksums", true);
+ IO.mapRequired("Checksums", Checksums);
+}
+
+void YAMLLinesSubsection::map(IO &IO) {
+ IO.mapTag("!Lines", true);
+ IO.mapRequired("CodeSize", Lines.CodeSize);
+
+ IO.mapRequired("Flags", Lines.Flags);
+ IO.mapRequired("RelocOffset", Lines.RelocOffset);
+ IO.mapRequired("RelocSegment", Lines.RelocSegment);
+ IO.mapRequired("Blocks", Lines.Blocks);
+}
+
+void YAMLInlineeLinesSubsection::map(IO &IO) {
+ IO.mapTag("!InlineeLines", true);
+ IO.mapRequired("HasExtraFiles", InlineeLines.HasExtraFiles);
+ IO.mapRequired("Sites", InlineeLines.Sites);
+}
+
+void MappingTraits<YAMLDebugSubsection>::mapping(
+ IO &IO, YAMLDebugSubsection &Subsection) {
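+  // On input, the YAML tag selects which concrete wrapper to instantiate;
+  // on output, the existing wrapper writes its own tag from map().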
+ if (!IO.outputting()) {
+ if (IO.mapTag("!FileChecksums")) {
+ auto SS = std::make_shared<YAMLChecksumsSubsection>();
+ Subsection.Subsection = SS;
+ } else if (IO.mapTag("!Lines")) {
+ Subsection.Subsection = std::make_shared<YAMLLinesSubsection>();
+ } else if (IO.mapTag("!InlineeLines")) {
+ Subsection.Subsection = std::make_shared<YAMLInlineeLinesSubsection>();
+ } else {
+ llvm_unreachable("Unexpected subsection tag!");
+ }
+ }
+ Subsection.Subsection->map(IO);
+}
+
+static Expected<const YAMLChecksumsSubsection &>
+findChecksums(ArrayRef<YAMLDebugSubsection> Subsections) {
+ for (const auto &SS : Subsections) {
+ if (SS.Subsection->Kind == DebugSubsectionKind::FileChecksums) {
+ return static_cast<const YAMLChecksumsSubsection &>(*SS.Subsection);
+ }
+ }
+ return make_error<CodeViewError>(cv_error_code::no_records);
+}
+
+std::unique_ptr<DebugSubsection> YAMLChecksumsSubsection::toCodeViewSubsection(
+ DebugStringTableSubsection *UseStrings,
+ DebugChecksumsSubsection *UseChecksums) const {
+ assert(UseStrings && !UseChecksums);
+ auto Result = llvm::make_unique<DebugChecksumsSubsection>(*UseStrings);
+ for (const auto &CS : Checksums) {
+ Result->addChecksum(CS.FileName, CS.Kind, CS.ChecksumBytes.Bytes);
+ }
+ return std::move(Result);
+}
+
+std::unique_ptr<DebugSubsection> YAMLLinesSubsection::toCodeViewSubsection(
+ DebugStringTableSubsection *UseStrings,
+ DebugChecksumsSubsection *UseChecksums) const {
+ assert(UseStrings && UseChecksums);
+ auto Result =
+ llvm::make_unique<DebugLinesSubsection>(*UseChecksums, *UseStrings);
+ Result->setCodeSize(Lines.CodeSize);
+ Result->setRelocationAddress(Lines.RelocSegment, Lines.RelocOffset);
+ Result->setFlags(Lines.Flags);
+ for (const auto &LC : Lines.Blocks) {
+ Result->createBlock(LC.FileName);
+ if (Result->hasColumnInfo()) {
+ for (const auto &Item : zip(LC.Lines, LC.Columns)) {
+ auto &L = std::get<0>(Item);
+ auto &C = std::get<1>(Item);
+ uint32_t LE = L.LineStart + L.EndDelta;
+ Result->addLineAndColumnInfo(L.Offset,
+ LineInfo(L.LineStart, LE, L.IsStatement),
+ C.StartColumn, C.EndColumn);
+ }
+ } else {
+ for (const auto &L : LC.Lines) {
+ uint32_t LE = L.LineStart + L.EndDelta;
+ Result->addLineInfo(L.Offset, LineInfo(L.LineStart, LE, L.IsStatement));
+ }
+ }
+ }
+ return llvm::cast<DebugSubsection>(std::move(Result));
+}
+
+std::unique_ptr<DebugSubsection>
+YAMLInlineeLinesSubsection::toCodeViewSubsection(
+ DebugStringTableSubsection *UseStrings,
+ DebugChecksumsSubsection *UseChecksums) const {
+ assert(UseChecksums);
+ auto Result = llvm::make_unique<DebugInlineeLinesSubsection>(
+ *UseChecksums, InlineeLines.HasExtraFiles);
+
+ for (const auto &Site : InlineeLines.Sites) {
+ Result->addInlineSite(TypeIndex(Site.Inlinee), Site.FileName,
+ Site.SourceLineNum);
+ if (!InlineeLines.HasExtraFiles)
+ continue;
+
+ for (auto EF : Site.ExtraFiles) {
+ Result->addExtraFile(EF);
+ }
+ }
+ return llvm::cast<DebugSubsection>(std::move(Result));
+}
+
+static Expected<SourceFileChecksumEntry>
+convertOneChecksum(const DebugStringTableSubsectionRef &Strings,
+ const FileChecksumEntry &CS) {
+ auto ExpectedString = Strings.getString(CS.FileNameOffset);
+ if (!ExpectedString)
+ return ExpectedString.takeError();
+
+ SourceFileChecksumEntry Result;
+ Result.ChecksumBytes.Bytes = CS.Checksum;
+ Result.Kind = CS.Kind;
+ Result.FileName = *ExpectedString;
+ return Result;
+}
+
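+// Map a file id (an index into the checksums subsection) back to the file
+// name it references in the string table.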
+static Expected<StringRef>
+getFileName(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums, uint32_t FileID) {
+ auto Iter = Checksums.getArray().at(FileID);
+ if (Iter == Checksums.getArray().end())
+ return make_error<CodeViewError>(cv_error_code::no_records);
+ uint32_t Offset = Iter->FileNameOffset;
+ return Strings.getString(Offset);
+}
+
+Expected<std::shared_ptr<YAMLChecksumsSubsection>>
+YAMLChecksumsSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &FC) {
+ auto Result = std::make_shared<YAMLChecksumsSubsection>();
+
+ for (const auto &CS : FC) {
+ auto ConvertedCS = convertOneChecksum(Strings, CS);
+ if (!ConvertedCS)
+ return ConvertedCS.takeError();
+ Result->Checksums.push_back(*ConvertedCS);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLLinesSubsection>>
+YAMLLinesSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugLinesSubsectionRef &Lines) {
+ auto Result = std::make_shared<YAMLLinesSubsection>();
+ Result->Lines.CodeSize = Lines.header()->CodeSize;
+ Result->Lines.RelocOffset = Lines.header()->RelocOffset;
+ Result->Lines.RelocSegment = Lines.header()->RelocSegment;
+ Result->Lines.Flags = static_cast<LineFlags>(uint16_t(Lines.header()->Flags));
+ for (const auto &L : Lines) {
+ SourceLineBlock Block;
+ auto EF = getFileName(Strings, Checksums, L.NameIndex);
+ if (!EF)
+ return EF.takeError();
+ Block.FileName = *EF;
+ if (Lines.hasColumnInfo()) {
+ for (const auto &C : L.Columns) {
+ SourceColumnEntry SCE;
+ SCE.EndColumn = C.EndColumn;
+ SCE.StartColumn = C.StartColumn;
+ Block.Columns.push_back(SCE);
+ }
+ }
+ for (const auto &LN : L.LineNumbers) {
+ SourceLineEntry SLE;
+ LineInfo LI(LN.Flags);
+ SLE.Offset = LN.Offset;
+ SLE.LineStart = LI.getStartLine();
+ SLE.EndDelta = LI.getLineDelta();
+ SLE.IsStatement = LI.isStatement();
+ Block.Lines.push_back(SLE);
+ }
+ Result->Lines.Blocks.push_back(Block);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLInlineeLinesSubsection>>
+YAMLInlineeLinesSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugInlineeLinesSubsectionRef &Lines) {
+ auto Result = std::make_shared<YAMLInlineeLinesSubsection>();
+
+ Result->InlineeLines.HasExtraFiles = Lines.hasExtraFiles();
+ for (const auto &IL : Lines) {
+ InlineeSite Site;
+ auto ExpF = getFileName(Strings, Checksums, IL.Header->FileID);
+ if (!ExpF)
+ return ExpF.takeError();
+ Site.FileName = *ExpF;
+ Site.Inlinee = IL.Header->Inlinee.getIndex();
+ Site.SourceLineNum = IL.Header->SourceLineNum;
+ if (Lines.hasExtraFiles()) {
+ for (const auto EF : IL.ExtraFiles) {
+ auto ExpF2 = getFileName(Strings, Checksums, EF);
+ if (!ExpF2)
+ return ExpF2.takeError();
+ Site.ExtraFiles.push_back(*ExpF2);
+ }
+ }
+ Result->InlineeLines.Sites.push_back(Site);
+ }
+ return Result;
+}
+
+Expected<std::vector<std::unique_ptr<DebugSubsection>>>
+llvm::CodeViewYAML::convertSubsectionList(
+ ArrayRef<YAMLDebugSubsection> Subsections,
+ DebugStringTableSubsection &Strings) {
+ std::vector<std::unique_ptr<DebugSubsection>> Result;
+ if (Subsections.empty())
+ return std::move(Result);
+
+ auto Checksums = findChecksums(Subsections);
+ if (!Checksums)
+ return Checksums.takeError();
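+  // Convert the checksums subsection first; the other subsection kinds hold
+  // file ids that refer into it.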
+ auto ChecksumsBase = Checksums->toCodeViewSubsection(&Strings, nullptr);
+ DebugChecksumsSubsection &CS =
+ llvm::cast<DebugChecksumsSubsection>(*ChecksumsBase);
+ for (const auto &SS : Subsections) {
+    // We've already converted the checksums subsection; don't do it twice.
+ std::unique_ptr<DebugSubsection> CVS;
+ if (SS.Subsection->Kind == DebugSubsectionKind::FileChecksums)
+ CVS = std::move(ChecksumsBase);
+ else
+ CVS = SS.Subsection->toCodeViewSubsection(&Strings, &CS);
+ Result.push_back(std::move(CVS));
+ }
+ return std::move(Result);
+}
+
+namespace {
+struct SubsectionConversionVisitor : public DebugSubsectionVisitor {
+ explicit SubsectionConversionVisitor(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums)
+ : Strings(Strings), Checksums(Checksums) {}
+
+ Error visitUnknown(DebugUnknownSubsectionRef &Unknown) override;
+ Error visitLines(DebugLinesSubsectionRef &Lines) override;
+ Error visitFileChecksums(DebugChecksumsSubsectionRef &Checksums) override;
+ Error visitInlineeLines(DebugInlineeLinesSubsectionRef &Inlinees) override;
+
+ YAMLDebugSubsection Subsection;
+
+private:
+ const DebugStringTableSubsectionRef &Strings;
+ const DebugChecksumsSubsectionRef &Checksums;
+};
+
+Error SubsectionConversionVisitor::visitUnknown(
+ DebugUnknownSubsectionRef &Unknown) {
+ return make_error<CodeViewError>(cv_error_code::operation_unsupported);
+}
+
+Error SubsectionConversionVisitor::visitLines(DebugLinesSubsectionRef &Lines) {
+ auto Result =
+ YAMLLinesSubsection::fromCodeViewSubsection(Strings, Checksums, Lines);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitFileChecksums(
+ DebugChecksumsSubsectionRef &Checksums) {
+ auto Result =
+ YAMLChecksumsSubsection::fromCodeViewSubsection(Strings, Checksums);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitInlineeLines(
+ DebugInlineeLinesSubsectionRef &Inlinees) {
+ auto Result = YAMLInlineeLinesSubsection::fromCodeViewSubsection(
+ Strings, Checksums, Inlinees);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+}
+
+Expected<YAMLDebugSubsection> YAMLDebugSubsection::fromCodeViewSubection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugSubsectionRecord &SS) {
+ SubsectionConversionVisitor V(Strings, Checksums);
+ if (auto EC = visitDebugSubsection(SS, V))
+ return std::move(EC);
+
+ return V.Subsection;
}
diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
index 6e8bb5c7372c..bd97af3a9323 100644
--- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
+++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
@@ -148,7 +148,8 @@ struct SymbolRecordBase {
virtual ~SymbolRecordBase() {}
virtual void map(yaml::IO &io) = 0;
virtual codeview::CVSymbol
- toCodeViewSymbol(BumpPtrAllocator &Allocator) const = 0;
+ toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container) const = 0;
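+  // The container kind is threaded through because symbol record padding
+  // differs between object files and PDBs.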
virtual Error fromCodeViewSymbol(codeview::CVSymbol Type) = 0;
};
@@ -159,8 +160,9 @@ template <typename T> struct SymbolRecordImpl : public SymbolRecordBase {
void map(yaml::IO &io) override;
codeview::CVSymbol
- toCodeViewSymbol(BumpPtrAllocator &Allocator) const override {
- return SymbolSerializer::writeOneSymbol(Symbol, Allocator);
+ toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container) const override {
+ return SymbolSerializer::writeOneSymbol(Symbol, Allocator, Container);
}
Error fromCodeViewSymbol(codeview::CVSymbol CVS) override {
return SymbolDeserializer::deserializeAs<T>(CVS, Symbol);
@@ -429,8 +431,8 @@ template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) {
}
CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol(
- BumpPtrAllocator &Allocator) const {
- return Symbol->toCodeViewSymbol(Allocator);
+ BumpPtrAllocator &Allocator, CodeViewContainer Container) const {
+ return Symbol->toCodeViewSymbol(Allocator, Container);
}
namespace llvm {
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index eb81e58b9b0e..17c60348633c 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -310,6 +310,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// Catch trivial redundancies
FPM.addPass(EarlyCSEPass());
+ // Hoisting of scalars and load expressions.
+ if (EnableGVNHoist)
+ FPM.addPass(GVNHoistPass());
+
// Speculative execution if the target has divergent branches; otherwise nop.
FPM.addPass(SpeculativeExecutionPass());
@@ -473,8 +477,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
EarlyFPM.addPass(SROA());
EarlyFPM.addPass(EarlyCSEPass());
EarlyFPM.addPass(LowerExpectIntrinsicPass());
- if (EnableGVNHoist)
- EarlyFPM.addPass(GVNHoistPass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
  // Interprocedural constant propagation now that basic cleanup has occurred
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 318e21da999d..f7b7ad89e959 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -649,12 +649,10 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::tce:
case Triple::tcele:
case Triple::thumbeb:
- case Triple::xcore:
- return Triple::ELF;
-
case Triple::wasm32:
case Triple::wasm64:
- return Triple::Wasm;
+ case Triple::xcore:
+ return Triple::ELF;
case Triple::ppc:
case Triple::ppc64:
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
index 4f656f94ea12..b99c1d1d6b3e 100644
--- a/lib/Target/AArch64/AArch64PBQPRegAlloc.h
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -1,4 +1,4 @@
-//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,8 @@
namespace llvm {
+class TargetRegisterInfo;
+
/// Add the accumulator chaining constraint to a PBQP graph
class A57ChainingConstraint : public PBQPRAConstraint {
public:
@@ -33,6 +35,7 @@ private:
// Add constraints between existing chains
void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
};
-}
+
+} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index d098cf7a5a37..7402bcf1346c 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -56,12 +56,14 @@ def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
+def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; }
def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
+def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; }
def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
@@ -76,6 +78,7 @@ def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
+def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; }
def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
@@ -83,6 +86,10 @@ def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
//===----------------------------------------------------------------------===//
// Define 2 micro-op types
+def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 1;
let NumMicroOps = 2;
@@ -476,17 +483,19 @@ def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
// -----------------------------------------------------------------------------
def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
-def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
+def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
MI->getOperand(1).getReg() == AArch64::XZR}]>;
def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>;
def FalkorWr_FMOV : SchedWriteVariant<[
- SchedVar<FalkorFMOVZrReg, [FalkorWr_1none_0cyc]>,
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1none_0cyc]>,
SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
def FalkorWr_MOVZ : SchedWriteVariant<[
SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
- SchedVar<NoSchedPred, [FalkorWr_1XYZB_1cyc]>]>;
+ SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd
+
def FalkorWr_ADDSUBsx : SchedWriteVariant<[
SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>,
@@ -500,6 +509,10 @@ def FalkorWr_LDRSro : SchedWriteVariant<[
SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>,
SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
+def FalkorWr_ORRi : SchedWriteVariant<[
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>;
+
def FalkorWr_PRFMro : SchedWriteVariant<[
SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>,
SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
@@ -810,7 +823,8 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>;
-def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>;
+def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(r|s)$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>;
def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>;
def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>;
@@ -825,7 +839,7 @@ def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>;
@@ -849,7 +863,7 @@ def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
-def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>;
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>;
@@ -1036,13 +1050,13 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFM
// FP Miscellaneous Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>;
-def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>;
+def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd
// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr
-def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>;
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd
def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
@@ -1107,11 +1121,12 @@ def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
// Move and Shift Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>;
-def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>;
-def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd
def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>;
-def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation)
def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>],
(instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>;
def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>],
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 78ff3bbe3d1a..55d18c3f3646 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -55,6 +55,8 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
extern char &AMDGPUMachineCFGStructurizerID;
+void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
+
ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index e7ebb37a9d62..b50e8d1d659e 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -365,6 +365,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
"Force to generate flat instruction for global"
>;
+def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
+ "auto-waitcnt-before-barrier",
+ "AutoWaitcntBeforeBarrier",
+ "true",
+ "Hardware automatically inserts waitcnt before barrier"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 1d03714874e2..8084d368c80f 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -22,18 +22,22 @@ using namespace llvm;
namespace {
class AMDGPUAlwaysInline : public ModulePass {
- static char ID;
-
bool GlobalOpt;
public:
- AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { }
+ static char ID;
+
+ AMDGPUAlwaysInline(bool GlobalOpt = false) :
+ ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
};
} // End anonymous namespace
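+// Register the pass: initializeAMDGPUAlwaysInlinePass is called from
+// LLVMInitializeAMDGPUTarget, and registry construction is why GlobalOpt
+// gained a default value above.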
+INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
+ "AMDGPU Inline All Functions", false, false)
+
char AMDGPUAlwaysInline::ID = 0;
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 57905be18813..267f4807a788 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -28,11 +28,16 @@ using namespace llvm;
AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
using namespace TargetOpcode;
+  const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
+ // FIXME: i1 operands to intrinsics should always be legal, but other i1
+ // values may not be legal. We need to figure out how to distinguish
+ // between these two scenarios.
+ setAction({G_CONSTANT, S1}, Legal);
setAction({G_CONSTANT, S32}, Legal);
setAction({G_CONSTANT, S64}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6e301b4ad527..8d157e2f98f2 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -91,6 +91,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FPExceptions(false),
DX10Clamp(false),
FlatForGlobal(false),
+ AutoWaitcntBeforeBarrier(false),
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 0582ce95693a..ed9cbb994fad 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -110,6 +110,7 @@ protected:
bool FPExceptions;
bool DX10Clamp;
bool FlatForGlobal;
+ bool AutoWaitcntBeforeBarrier;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
bool HasApertureRegs;
@@ -195,7 +196,8 @@ public:
}
bool isOpenCLEnv() const {
- return TargetTriple.getEnvironment() == Triple::OpenCL;
+ return TargetTriple.getEnvironment() == Triple::OpenCL ||
+ TargetTriple.getEnvironmentName() == "amdgizcl";
}
Generation getGeneration() const {
@@ -363,6 +365,10 @@ public:
return FlatForGlobal;
}
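+
+  /// \returns true if the hardware automatically inserts a waitcnt before
+  /// barriers, making an explicit waitcnt unnecessary.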
+ bool hasAutoWaitcntBeforeBarrier() const {
+ return AutoWaitcntBeforeBarrier;
+ }
+
bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;
}
@@ -727,12 +733,6 @@ public:
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
- /// \returns True if waitcnt instruction is needed before barrier instruction,
- /// false otherwise.
- bool needWaitcntBeforeBarrier() const {
- return true;
- }
-
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 596f02ae4a64..404598ff4738 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -116,7 +116,7 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
static cl::opt<bool> EnableSIInsertWaitcntsPass(
"enable-si-insert-waitcnts",
cl::desc("Use new waitcnt insertion pass"),
- cl::init(false));
+ cl::init(true));
// Option to run late CFG structurizer
static cl::opt<bool> LateCFGStructurize(
@@ -139,6 +139,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f13629a3185f..dfac068d1f69 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -35,9 +35,12 @@ struct FoldCandidate {
};
unsigned char UseOpNo;
MachineOperand::MachineOperandType Kind;
+ bool Commuted;
- FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
- UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+ FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
+ bool Commuted_ = false) :
+ UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+ Commuted(Commuted_) {
if (FoldOp->isImm()) {
ImmToFold = FoldOp->getImm();
} else if (FoldOp->isFI()) {
@@ -59,6 +62,10 @@ struct FoldCandidate {
bool isReg() const {
return Kind == MachineOperand::MO_Register;
}
+
+ bool isCommuted() const {
+ return Commuted;
+ }
};
class SIFoldOperands : public MachineFunctionPass {
@@ -237,8 +244,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
return false;
+ }
+
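+  // Remember that the instruction was commuted so the commute can be undone
+  // later if the fold ultimately fails (see foldInstOperand).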
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+ return true;
}
FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
@@ -699,6 +711,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
tryFoldInst(TII, Fold.UseMI);
+ } else if (Fold.isCommuted()) {
+    // Restore the instruction's original operand order if the fold failed.
+ TII->commuteInstruction(*Fold.UseMI, false);
}
}
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 76c2644867aa..b48b23911105 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3571,7 +3571,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- isMemOpHasNoClobberedMemOperand(Load))
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e22166d03e9a..c10badba88f3 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1009,7 +1009,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// occurs before the instruction. Doing it here prevents any additional
// S_WAITCNTs from being emitted if the instruction was marked as
// requiring a WAITCNT beforehand.
- if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) {
+ if (MI.getOpcode() == AMDGPU::S_BARRIER &&
+ !ST->hasAutoWaitcntBeforeBarrier()) {
EmitSwaitcnt |=
ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
EmitSwaitcnt |= ScoreBrackets->updateByWait(
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 9f32ecfa52ff..bc86515d8b1f 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -630,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// but we also want to wait for any other outstanding transfers before
// signalling other hardware blocks
if ((I->getOpcode() == AMDGPU::S_BARRIER &&
- ST->needWaitcntBeforeBarrier()) ||
+ !ST->hasAutoWaitcntBeforeBarrier()) ||
I->getOpcode() == AMDGPU::S_SENDMSG ||
I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Required = LastIssued;
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 5b840a14dbc3..73dd8b7daa4e 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -229,6 +229,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
(Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
+ !Ld->isVolatile() &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 001fc960b228..77fc9551cff9 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -245,9 +245,10 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
let SubtargetPredicate = Has16BitInsts in {
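+// v_div_fixup_f16 is not commutable, hence it is defined outside the
+// "let isCommutable = 1" block below.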
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+
let isCommutable = 1 in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 46fd1f70ee99..ca68f5d42c32 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -205,6 +205,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
"AvoidCPSRPartialUpdate", "true",
"Avoid CPSR partial update for OOO execution">;
+/// Disable +1 predication cost for instructions updating CPSR.
+/// Enabled for Cortex-A57.
+def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr",
+ "CheapPredicableCPSRDef",
+ "true",
+ "Disable +1 predication cost for instructions updating CPSR">;
+
def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
"AvoidMOVsShifterOperand", "true",
"Avoid movs instructions with shifter operand">;
@@ -788,12 +795,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53,
FeatureCRC,
FeatureFPAO]>;
-def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57,
- FeatureHWDivThumb,
- FeatureHWDivARM,
- FeatureCrypto,
- FeatureCRC,
- FeatureFPAO]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFPAO,
+ FeatureAvoidPartialCPSR,
+ FeatureCheapPredicableCPSR]>;
def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
FeatureHWDivThumb,
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 5c9d589e2625..f8b65573f9cd 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -558,13 +558,68 @@ bool ARMBaseInstrInfo::DefinesPredicate(
return Found;
}
-static bool isCPSRDefined(const MachineInstr *MI) {
- for (const auto &MO : MI->operands())
+bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) {
+ for (const auto &MO : MI.operands())
if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead())
return true;
return false;
}
+bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ return Offset.getReg() != 0;
+}
+
+// On Cortex-A57, a load with a negative register offset requires an
+// additional cycle and an extra I unit.
+bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ assert(Opc.isImm());
+ assert(Offset.isReg());
+ int64_t OpcImm = Opc.getImm();
+
+ bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub;
+ return (isSub && Offset.getReg() != 0);
+}
+
+bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
+// Load, scaled register offset, not plus LSL2
+bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+
+ bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+ if (ShiftOpc == ARM_AM::no_shift) return false; // not scaled
+ bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2);
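+  // "Plus LSL #2" is the only scaled form the A57 model treats as optimal.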
+ return !SimpleScaled;
+}
+
+// Minus reg for ldstso addr mode
+bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+}
+
+// Load, scaled register offset
+bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
static bool isEligibleForITBlock(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return true;
@@ -590,7 +645,7 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
case ARM::tSUBi3: // SUB (immediate) T1
case ARM::tSUBi8: // SUB (immediate) T2
case ARM::tSUBrr: // SUB (register) T1
- return !isCPSRDefined(MI);
+ return !ARMBaseInstrInfo::isCPSRDefined(*MI);
}
}
@@ -3349,6 +3404,22 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
return DefCycle;
}
+bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
+ unsigned BaseReg = MI.getOperand(0).getReg();
+ for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
+ const auto &Op = MI.getOperand(i);
+ if (Op.isReg() && Op.getReg() == BaseReg)
+ return true;
+ }
+ return false;
+}
+unsigned
+ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
+ // ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops
+ // (outs GPR:$wb), (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+ return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
+}
+
int
ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -4119,7 +4190,8 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
const MCInstrDesc &MCID = MI.getDesc();
- if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+ if (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef())) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
return 1;
@@ -4148,7 +4220,8 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
const MCInstrDesc &MCID = MI.getDesc();
- if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+ if (PredCost && (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef()))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
*PredCost = 1;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index dd7fe871345a..c52e572786d4 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -159,6 +159,24 @@ public:
bool isPredicable(const MachineInstr &MI) const override;
+ // CPSR defined in instruction
+ static bool isCPSRDefined(const MachineInstr &MI);
+ bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const;
+ bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const;
+
+ // Load, scaled register offset
+ bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load, scaled register offset, not plus LSL2
+ bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const;
+ // Minus reg for ldstso addr mode
+ bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const;
+ // Scaled register offset in address mode 2
+ bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load multiple, base reg in list
+ bool isLDMBaseRegInList(const MachineInstr &MI) const;
+ // get LDM variable defs size
+ unsigned getLDMVariableDefsSize(const MachineInstr &MI) const;
+
/// GetInstSize - Returns the size of the specified MachineInstr.
///
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 31a2f499a9a7..a33d025d114e 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -34,7 +34,7 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
- if (T->isArrayTy())
+ if (T->isArrayTy() || T->isStructTy())
return true;
EVT VT = TLI.getValueType(DL, T, true);
@@ -167,8 +167,11 @@ void ARMCallLowering::splitToValueTypes(
if (SplitVTs.size() == 1) {
// Even if there is no splitting to do, we still want to replace the
// original type (e.g. pointer type -> integer).
- SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags, OrigArg.IsFixed);
+ auto Flags = OrigArg.Flags;
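+    // Record the ABI alignment of the original type so argument lowering
+    // can lay out stack arguments correctly.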
+ unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty);
+ Flags.setOrigAlign(OriginalAlignment);
+ SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags,
+ OrigArg.IsFixed);
return;
}
@@ -177,6 +180,10 @@ void ARMCallLowering::splitToValueTypes(
EVT SplitVT = SplitVTs[i];
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
auto Flags = OrigArg.Flags;
+
+ unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
bool NeedsConsecutiveRegisters =
TLI.functionArgumentNeedsConsecutiveRegisters(
SplitTy, F->getCallingConv(), F->isVarArg());
@@ -185,6 +192,7 @@ void ARMCallLowering::splitToValueTypes(
if (i == e - 1)
Flags.setInConsecutiveRegsLast();
}
+
SplitArgs.push_back(
ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
SplitTy, Flags, OrigArg.IsFixed});
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index ec5b97cba8cd..1c7902520f2d 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -147,6 +147,9 @@ def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
(void)TII;
+ const ARMSubtarget *STI =
+ static_cast<const ARMSubtarget*>(SchedModel->getSubtargetInfo());
+ (void)STI;
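+  // Expose the subtarget to scheduling predicates; the (void) casts suppress
+  // unused-variable warnings in models that do not reference TII or STI.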
}]>;
def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
@@ -420,3 +423,4 @@ include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
+include "ARMScheduleA57.td"
diff --git a/lib/Target/ARM/ARMScheduleA57.td b/lib/Target/ARM/ARMScheduleA57.td
new file mode 100644
index 000000000000..525079d12d51
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleA57.td
@@ -0,0 +1,1471 @@
+//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
+def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
+def IsCPSRDefinedAndPredicatedPred :
+ SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
+
+// Cortex A57 rev. r1p0 or later (false = r0px)
+def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
+
+// If Addrmode3 contains register offset (not immediate)
+def IsLdrAm3RegOffPred :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3RegOffPredX2 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
+def IsLdrAm3RegOffPredX3 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
+
+// If Addrmode3 contains "minus register"
+def IsLdrAm3NegRegOffPred :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3NegRegOffPredX2 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
+def IsLdrAm3NegRegOffPredX3 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
+
+// Load, scaled register offset, not plus LSL2
+def IsLdstsoScaledNotOptimalPredX0 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
+def IsLdstsoScaledNotOptimalPred :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
+def IsLdstsoScaledNotOptimalPredX2 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdstsoScaledPred :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
+def IsLdstsoScaledPredX2 :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
+
+def IsLdstsoMinusRegPredX0 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
+def IsLdstsoMinusRegPred :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
+def IsLdstsoMinusRegPredX2 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdrAm2ScaledPred :
+ SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
+
+// LDM, base reg in list
+def IsLdmBaseRegInList :
+ SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
+
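+// Grouping helper: carries a list of SchedWriteRes, used below when defining
+// load/store-multiple (LDM/STM) write variants.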
+class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
+ list <SchedWriteRes> Writes = writes;
+ SchedMachineModel SchedModel = ?;
+}
+
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// (AArch64SchedA57.td)
+def CortexA57Model : SchedMachineModel {
+ let IssueWidth = 3; // 3-way decode and dispatch
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 16; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+
+def A57UnitX : ProcResource<1>; // Type X micro-ops (F1)
+def A57UnitW : ProcResource<1>; // Type W micro-ops (F0)
+
+let SchedModel = CortexA57Model in {
+ def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
+}
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "ARMScheduleA57WriteRes.td"
+
+// To have "CompleteModel = 1", support of pseudos and special instructions
+def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
+ "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
+ "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
+ "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
+ "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+
+def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
+
+// Specific memory instrs
+def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
+ "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
+
+// coprocessor moves
+def : InstRW<[WriteNoop, WriteNoop], (instregex
+ "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
+ "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
+ "(t2)?MSR(banked|i|_AR|_M)?$")>;
+
+// Deprecated instructions
+def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
+
+// Pseudos
+def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
+ "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
+ "tLDRpci_pic", "t2SUBS_PC_LR",
+ "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
+ "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "WIN__CHKSTK", "WIN__DBZCHK")>;
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
+
+// --- 3.2 Branch Instructions ---
+// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
+
+def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
+ "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
+def : InstRW<[A57Write_1cyc_1B_1I],
+ (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
+// Pseudos
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
+def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
+ "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
+def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
+
+// --- 3.3 Arithmetic and Logical Instructions ---
+// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
+// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
+
+def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
+
+// shift by register, conditional or unconditional
+// TODO: according to the doc, conditional uses I0/I1, unconditional uses M.
+// Why would the more complex instruction use the simpler pipeline?
+// This may be an error in the doc.
+def A57WriteALUsi : SchedWriteVariant<[
+ // lsl #2, lsl #1, or lsr #1.
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUSsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57ReadALUsr : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>
+]>;
+def : SchedAlias<WriteALUsi, A57WriteALUsi>;
+def : SchedAlias<WriteALUsr, A57WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
+def : SchedAlias<ReadALUsr, A57ReadALUsr>;
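+// Note on variant resolution (a reading of the TableGen semantics, for
+// illustration): a SchedWriteVariant selects the first SchedVar whose
+// predicate matches, so e.g. A57WriteALUsr resolves to A57Write_2cyc_1I for a
+// predicated shift-by-register ALU op and to A57Write_2cyc_1M otherwise.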
+
+def A57WriteCMPsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : SchedAlias<WriteCMP, A57Write_1cyc_1I>;
+def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
+def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
+
+// --- 3.4 Move and Shift Instructions ---
+// Move, basic
+// MOV{S}, MOVW, MVN{S}
+def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
+ "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
+ "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
+
+// Move, shift by immed, setflags/no setflags
+// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
+// setflags = isCPSRDefined
+def A57WriteMOVsi : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
+ "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
+ "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
+
+// shift by register, conditional or unconditional, setflags/no setflags
+def A57WriteMOVsr : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
+ "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
+ "(t2|t)RORrr")>;
+
+// Move, top
+// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
+def A57WriteMOVT : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
+
+def A57WriteI2pc :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
+def A57WriteI2ld :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
+def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+// +2cyc for branch forms
+def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
+
+// --- 3.5 Divide and Multiply Instructions ---
+// Divide: SDIV, UDIV
+// latency from documentation: 4 - 20, the maximum is taken
+def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
+// Multiply: tMUL is not bound to common WriteRes types
+def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
+def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
+def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
+def : ReadAdvance<ReadMUL, 0>;
+
+// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
+// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
+// Multiply-accumulate pipelines support late forwarding of accumulate
+// operands from similar μops, allowing a typical sequence of
+// multiply-accumulate μops to issue once per cycle (sched advance = 2).
+def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
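+// Worked example (our reading, for illustration): with Latency = 3 and a
+// read advance of 2 on the accumulate operand, a dependent MLA->MLA chain
+// sees an effective accumulator latency of 3 - 2 = 1 cycle, matching the
+// once-per-cycle issue rate described above.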
+
+def : SchedAlias<WriteMAC16, A57WriteMLA>;
+def : SchedAlias<WriteMAC32, A57WriteMLA>;
+def : SchedAlias<ReadMAC, A57ReadMLA>;
+
+def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
+def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
+
+// Multiply long: SMULL, UMULL
+def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
+def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
+
+// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
+// Parallel arith
+// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
+// Conditional GE-setting instructions require three extra μops
+// and two additional cycles to conditionally update the GE field.
+def A57WriteParArith : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1I_1M]>
+]>;
+def : InstRW< [A57WriteParArith], (instregex
+ "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
+ "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
+
+// Parallel arith with exchange: SASX, SSAX, UASX, USAX
+def A57WriteParArithExch : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1I_1M]>
+]>;
+def : InstRW<[A57WriteParArithExch],
+ (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
+
+// Parallel halving arith
+// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex
+ "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
+ "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
+
+// Parallel halving arith with exchange
+// SHASX, SHSAX, UHASX, UHSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
+ "(t2)?UHASX", "(t2)?UHSAX")>;
+
+// Parallel saturating arith
+// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
+ "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
+
+// Parallel saturating arith with exchange
+// QASX, QSAX, UQASX, UQSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
+ "(t2)?UQASX", "(t2)?UQSAX")>;
+
+// Saturate: SSAT, SSAT16, USAT, USAT16
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
+
+// Saturating arith: QADD, QSUB
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
+
+// Saturating doubling arith: QDADD, QDSUB
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
+
+// --- 3.7 Miscellaneous Data-Processing Instructions ---
+// Bit field extract: SBFX, UBFX
+def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
+
+// Bit field insert/clear: BFI, BFC
+def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
+
+// Select bytes, conditional/unconditional
+def A57WriteSEL : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
+
+// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
+def : InstRW<[A57Write_1cyc_1I],
+ (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
+
+// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
+
+// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
+def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
+
+// Sum of absolute differences: USAD8, USADA8
+def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
+
+// --- 3.8 Load Instructions ---
+
+// Load, immed offset
+// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
+def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
+ "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
+ "PICLDR", "tLDR")>;
+
+def : InstRW<[A57Write_4cyc_1L],
+ (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
+
+// For "Load, register offset, minus" we need +1cyc, +1I
+def A57WriteLdrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
+def A57WriteLdrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
+def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
+
+def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
+
+def A57WrBackOne : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def A57WrBackTwo : SchedWriteRes<[]> {
+ let Latency = 2;
+ let NumMicroOps = 0;
+}
+def A57WrBackThree : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
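+// These writes model only the base-register update of pre/post-indexed
+// forms: they carry the writeback latency but consume no issue slot
+// (NumMicroOps = 0). In the InstRW lists below they are paired with a normal
+// write, e.g. [A57Write_4cyc_1L_1I, A57WrBackOne] gives 4 cyc for the loaded
+// value and 1 cyc for the updated base.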
+
+// --- LDR pre-indexed ---
+// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
+ "LDRB_PRE_IMM", "t2LDRB_PRE")>;
+
+// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
+// (5 cyc load result for not-lsl2 scaled)
+def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
+ (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
+
+def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
+ (instregex "LDR(H|SH|SB)_PRE")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_PRE")>;
+
+// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
+def A57WriteLdrDAm3Pre : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
+ (instregex "LDRD_PRE")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_PRE")>;
+
+// --- LDR post-indexed ---
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
+ "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
+
+def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
+ (instregex "LDR(H|SH|SB)_POST")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_POST")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
+ "LDRB_POST_REG", "LDR(B?)T_POST$")>;
+
+def A57WriteLdrTRegPost : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
+ SchedVar<NoSchedPred, [A57WrBackTwo]>
+]>;
+// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
+def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
+ (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
+
+def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_POST")>;
+
+// --- Preload instructions ---
+// Preload, immed offset
+def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
+ "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
+
+// Preload, register offset:
+// 5cyc "I0/I1, L" for minus reg, or for scaled other than plus lsl2;
+// otherwise 4cyc "L"
+def A57WritePLD : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
+
+// --- Load multiple instructions ---
+foreach NumAddr = 1-8 in {
+ def A57LMAddrPred#NumAddr :
+ SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
+}
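+// For illustration, the NumAddr = 2 iteration of the loop above expands
+// (via '#' string concatenation) to:
+// def A57LMAddrPred2 :
+//   SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == 2">;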
+
+def A57LDMOpsListNoregin : A57WriteLMOpsListType<
+ [A57Write_3cyc_1L, A57Write_3cyc_1L,
+ A57Write_4cyc_1L, A57Write_4cyc_1L,
+ A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L]>;
+def A57WriteLDMnoreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]>
+]> { let Variadic=1; }
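+// Illustrative reading of the variadic variant above: an LDM loading three
+// registers matches A57LMAddrPred2 ((3+1)/2 == 2) and selects the slice
+// Writes[0-3] (3, 3, 4, 4 cyc); its three defs then take the first three of
+// those writes in order.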
+
+def A57LDMOpsListRegin : A57WriteLMOpsListType<
+ [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
+def A57WriteLDMreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57LDMOpsList_Upd : A57WriteLMOpsListType<
+ [A57WrBackOne,
+ A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
+ A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
+def A57WriteLDM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>,
+ SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
+]> { let Variadic=1; }
+
+def A57WriteLDM : SchedWriteVariant<[
+ SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
+ SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
+
+// TODO: no writeback latency defined in documentation (implemented as 1 cyc)
+def : InstRW<[A57WriteLDM_Upd],
+ (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
+
+// --- 3.9 Store Instructions ---
+
+// Store, immed offset
+def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
+ "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
+
+// Store, register offset
+// Minus reg, or scaled other than plus lsl2, needs 3cyc "I0/I1, S";
+// otherwise 1cyc S.
+def A57WriteStrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
+
+// STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
+def A57WriteStrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
+def A57WriteStrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
+
+// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
+ "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
+ "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
+
+// Store, register pre-indexed:
+// 1(1) "S, I0/I1" for plus reg
+// 3(2) "I0/I1, S" for minus reg
+// 1(2) "S, M" for scaled plus lsl2
+// 3(2) "I0/I1, S" for other scaled
+def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoScaledPredX2, [A57Write_1cyc_1S_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledPredX2, [A57WrBackTwo]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
+ (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
+
+// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
+// 1(1) "S, I0/I1" for imm or reg plus
+// 3(2) "I0/I1, S" for reg minus
+def A57WriteStrAm3PreX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
+ (instregex "STRH_PRE")>;
+
+def A57WriteStrAm3PreX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
+ (instregex "STRD_PRE")>;
+
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
+ "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
+
+// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
+def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
+ "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
+
+// post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
+// 1(1) "S, I0/I1" both for reg or imm
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
+
+// --- Store multiple instructions ---
+// TODO: no writeback latency defined in documentation
+def A57WriteSTM : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteSTM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
+def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
+ (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
+
+// --- 3.10 FP Data Processing Instructions ---
+def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
+
+// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
+def A57WriteVcmp : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1X]>
+]>;
+def : InstRW<[A57WriteVcmp],
+ (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
+
+// fp convert
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
+
+def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
+
+// FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
+
+// FP divide, FP square root
+def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
+def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
+
+// FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
+
+// FP multiply-accumulate pipelines support late forwarding of the result
+// from FP multiply μops to the accumulate operands of an
+// FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
+// after the FP multiply μop has been issued.
+// FP multiply, FZ
+def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+
+def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
+def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
+def : ReadAdvance<ReadFPMUL, 0>;
+
+// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
+// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
+def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+
+// VFMA takes 9 cyc in the common case and 4 cyc in a VFMA->VFMA chain
+// (5 cyc read advance). VMUL takes 5 cyc in the common case and 1 cyc in a
+// VMUL->VFMA chain (4 cyc read advance). Currently there is no way to give
+// the VFMA accumulate operand different read advances for VFMA and VMUL
+// producers, so a single read advance of 5 is used. Zero latency (instead of
+// one) for VMUL->VFMA should not break anything.
+// The same applies to the ASIMD VMUL/VFMA instructions.
+// def A57ReadVFMA : SchedRead;
+// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
+// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
+def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
+
+def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
+def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
+def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
+
+// --- 3.11 FP Miscellaneous Instructions ---
+// VMOV: 3cyc "F0/F1" for imm/reg
+def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+
+// 5cyc L for FP transfer, vfp to core reg,
+// 5cyc L for FP transfer, core reg to vfp
+def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
+// VMOVRRS/VMOVRRD are declared in common code with one WriteFPMOV
+// (instead of two).
+def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
+
+// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
+def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>;
+
+// --- 3.12 FP Load Instructions ---
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
+
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
+
+// FP load multiple (VLDM)
+
+def A57VLDMOpsListUncond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_11cyc_1L,
+ A57Write_12cyc_1L, A57Write_12cyc_1L]>;
+def A57WriteVLDMuncond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_12cyc_1L,
+ A57Write_13cyc_1L, A57Write_14cyc_1L,
+ A57Write_15cyc_1L, A57Write_16cyc_1L,
+ A57Write_17cyc_1L, A57Write_18cyc_1L,
+ A57Write_19cyc_1L, A57Write_20cyc_1L]>;
+def A57WriteVLDMcond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
+
+def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
+ A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
+def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
+ A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
+ A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
+ A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
+ A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
+def A57WriteVLDMcond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM_UPD : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
+ (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
+
+// --- 3.13 FP Store Instructions ---
+def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
+
+def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
+
+def A57WriteVSTMs : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteVSTMd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1S]>
+]>;
+def A57WriteVSTMs_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+def A57WriteVSTMd_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
+def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
+ (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
+ (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
+
+// --- 3.14 ASIMD Integer Instructions ---
+
+// ASIMD absolute diff, 3cyc F0/F1 for integer VABD
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
+
+// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
+def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>;
+def : InstRW<[A57WriteVABAD, A57ReadVABAD],
+ (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
+def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>;
+def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
+ (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
+
+// ASIMD absolute diff accum long: 4(1) F1 for VABAL
+def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>;
+def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
+
+// ASIMD absolute diff long: 3cyc F0/F1 for VABDL
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
+
+// ASIMD arith, basic
+def : InstRW<[A57Write_3cyc_1V], (instregex "VADD", "VADDL", "VADDW",
+ "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
+ "VPADDi", "VPADDL", "VSUB", "VSUBL", "VSUBW")>;
+
+// ASIMD arith, complex
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
+ "VQABS", "VQADD", "VQNEG", "VQSUB",
+ "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
+
+// ASIMD compare
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
+
+// ASIMD logical
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
+
+// ASIMD max/min
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
+
+// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
+// and multiply-with-accumulate instructions relative to r0pX.
+def A57WriteVMULD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULD_VecInt], (instregex
+ "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
+ "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
+
+// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
+def A57WriteVMULQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def : InstRW<[A57WriteVMULQ_VecInt], (instregex
+ "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
+ "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate, D-form
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAD_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
+ (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
+
+// ASIMD multiply accumulate, Q-form
+// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def A57ReadVMLAQ_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
+ (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
+ (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
+
+// ASIMD multiply accumulate saturating long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (3 or 2 ReadAdvance)
+def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
+ (instregex "VQDMLAL", "VQDMLSL")>;
+
+// ASIMD multiply long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+def A57WriteVMULL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULL_VecInt],
+ (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
+
+// ASIMD pairwise add and accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>;
+def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
+
+// ASIMD shift accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>;
+def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[A57Write_3cyc_1X],
+ (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
+ "VRSHRN")>;
+
+// ASIMD shift by immed and insert, basic, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by immed and insert, basic, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, basic, D-form
+def : InstRW<[A57Write_3cyc_1X], (instregex
+ "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+// VQRSHL, VQSHL, VRSHL
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
+ "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
+ "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// --- 3.15 ASIMD Floating-Point Instructions ---
+// ASIMD FP absolute value
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
+
+// ASIMD FP arith
+def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
+ "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
+
+// ASIMD FP compare
+def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
+ "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
+
+// ASIMD FP convert, integer
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
+ "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
+ "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
+
+// ASIMD FP convert, half-precision: 8cyc F0/F1
+def : InstRW<[A57Write_8cyc_1V], (instregex
+ "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
+ "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
+ "VCVT(f2h|h2f)")>;
+
+// ASIMD FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
+
+// ASIMD FP multiply
+def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
+
+// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
+def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57ReadVMLA_VecFP :
+ SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
+def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
+ (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
+
+// ASIMD FP negate
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
+
+// ASIMD FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
+
+// --- 3.16 ASIMD Miscellaneous Instructions ---
+
+// ASIMD bitwise insert
+def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
+
+// ASIMD count
+def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
+
+// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
+
+// ASIMD duplicate, scalar: 3cyc "F0/F1"
+def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
+
+// ASIMD extract
+def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
+
+// ASIMD move, immed
+def : InstRW<[A57Write_3cyc_1V], (instregex
+ "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
+ "VMOVQ0")>;
+
+// ASIMD move, narrowing
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
+
+// ASIMD reciprocal estimate
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
+
+// ASIMD reciprocal step, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
+
+// ASIMD reverse, swap, table lookup (1-2 reg)
+def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
+
+// ASIMD table lookup (3-4 reg)
+def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
+
+// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
+def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
+
+// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
+
+// ASIMD transpose
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
+
+// ASIMD unzip/zip, D-form
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
+ (instregex "VUZPd", "VZIPd")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
+ (instregex "VUZPq", "VZIPq")>;
+
+// --- 3.17 ASIMD Load Instructions ---
+
+// Overridden via InstRW for this processor.
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
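+// Our reading: the empty resource lists above only declare these common
+// SchedWrite types for this model; the real per-instruction timings come
+// from the InstRW overrides below.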
+
+// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+
+// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_6cyc_1L],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
+
+def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
+
+// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V],
+ (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+
+// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2b(8|16|32)wb")>;
+
+// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+ "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+// 2 results + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
+ (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
+ "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
+// 3 results
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)$")>;
+// 1 result
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+// 3 results + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3LN(d|q)32$",
+ "VLD3LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3LN(d|q)(8|16)$",
+ "VLD3LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3DUP(d|q)(8|16|32)$",
+ "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4LN(d|q)32$",
+ "VLD4LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4LN(d|q)(8|16)$",
+ "VLD4LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4DUP(d|q)(8|16|32)$",
+ "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.18 ASIMD Store Instructions ---
+
+// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
+def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
+def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
+ (instregex "VST1q(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
+def : InstRW<[A57Write_3cyc_1S],
+ (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
+def : InstRW<[A57Write_4cyc_1S],
+ (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2(d|b)(8|16|32)$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2(b|d)(8|16|32)wb")>;
+// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2LN(d|q)(8|16|32)_UPD",
+ "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 3 element, multiple, 3 reg
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3(d|q)(8|16|32)_UPD",
+ "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 3 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3LN(d|q)(8|16|32)_UPD",
+ "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 4 element, multiple, 4 reg
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST4(d|q)(8|16|32)_UPD",
+ "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 4 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST4LN(d|q)(8|16|32)_UPD",
+ "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.19 Cryptography Extensions ---
+// Crypto AES ops
+// AESD, AESE, AESIMC, AESMC: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
+// Crypto SHA1 xor ops: 6cyc F0/F1
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+// Crypto SHA1 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+// Crypto SHA1 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+// Crypto SHA256 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+// Crypto SHA256 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+
+// --- 3.20 CRC ---
+def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
+
+// -----------------------------------------------------------------------------
+// Common definitions
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
+
+def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
+
+def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
+def : SchedAlias<WriteST, A57Write_1cyc_1S>;
+def : ReadAdvance<ReadALU, 0>;
+
+} // SchedModel = CortexA57Model
+
diff --git a/lib/Target/ARM/ARMScheduleA57WriteRes.td b/lib/Target/ARM/ARMScheduleA57WriteRes.td
new file mode 100644
index 000000000000..670717dc7c13
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleA57WriteRes.td
@@ -0,0 +1,323 @@
+//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and
+// four to V pipes.
+//
+//===----------------------------------------------------------------------===//
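+// Written out, the example above would correspond to a def of this shape
+// (shown for illustration of the convention only):
+// def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+//     A57UnitS, A57UnitS, A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+//     A57UnitV, A57UnitV, A57UnitV, A57UnitV]> {
+//   let Latency = 6;
+//   let NumMicroOps = 11;
+// }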
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+ let ResourceCycles = [17]; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+ let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
+def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; }
+def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; }
+def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+
+// A57Write_3cyc_1L - A57Write_20cyc_1L
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> {
+ let Latency = Lat;
+ }
+}
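+// e.g. the Lat = 4 iteration of the loop above produces:
+// def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; }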
+
+// A57Write_4cyc_1S - A57Write_16cyc_1S
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> {
+ let Latency = Lat;
+ }
+}
+
+def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; }
+def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 6; }
+def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 6; }
+def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 8; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 64;
+ let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS,
+ A57UnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB,
+                                         A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 36;
+ let NumMicroOps = 2;
+ let ResourceCycles = [18, 18];
+}
+def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index d2630685d91b..af682dd8321c 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -234,6 +234,10 @@ protected:
/// CPSR setting instruction.
bool AvoidCPSRPartialUpdate = false;
+ /// CheapPredicableCPSRDef - If true, disable +1 predication cost
+ /// for instructions updating CPSR. Enabled for Cortex-A57.
+ bool CheapPredicableCPSRDef = false;
+
/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
bool AvoidMOVsShifterOperand = false;
@@ -543,6 +547,7 @@ public:
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+ bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasMPExtension() const { return HasMPExtension; }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 0fef91ec4d3e..b76da727237c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -3419,9 +3419,7 @@ int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
if (NewOpcode >= 0)
return NewOpcode;
-
- dbgs() << "Cannot convert to .new: " << getName(MI.getOpcode()) << '\n';
- llvm_unreachable(nullptr);
+ return 0;
}
int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
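With the llvm_unreachable gone, getDotNewPredOp now signals "no .new variant" by returning 0. A minimal caller-side sketch of how that result is presumably consumed; HII, MI, and MBPI are assumed names, not taken from this patch:

  // Sketch only: treat a zero return as "keep the instruction as-is".
  int NewOp = HII->getDotNewPredOp(MI, MBPI);
  if (NewOp == 0)
    return false;               // no predicated-.new opcode for this MI
  MI.setDesc(HII->get(NewOp));  // otherwise switch MI to the .new form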
diff --git a/lib/Target/Mips/MicroMipsSizeReduction.cpp b/lib/Target/Mips/MicroMipsSizeReduction.cpp
index 4593fc92ca6f..35948e36ad91 100644
--- a/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -135,6 +135,14 @@ private:
// returns true on success.
static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry);
+  // Attempts to reduce LBU/LHU instructions into LBU16/LHU16,
+ // returns true on success.
+ static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry);
+
+  // Attempts to reduce SB/SH instructions into SB16/SH16,
+ // returns true on success.
+ static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry);
+
// Attempts to reduce arithmetic instructions, returns true on success
static bool ReduceArithmeticInstructions(MachineInstr *MI,
const ReduceEntry &Entry);
@@ -162,10 +170,26 @@ llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
{RT_OneInstr, OpCodes(Mips::ADDu_MM, Mips::ADDU16_MM),
ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
ImmField(0, 0, 0, -1)},
+ {RT_OneInstr, OpCodes(Mips::LBu, Mips::LBU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
+ {RT_OneInstr, OpCodes(Mips::LBu_MM, Mips::LBU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
+ {RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
{RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
{RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP,
OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SB_MM, Mips::SB16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SH, Mips::SH16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SH_MM, Mips::SH16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
{RT_OneInstr, OpCodes(Mips::SUBu, Mips::SUBU16_MM),
ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
ImmField(0, 0, 0, -1)},
@@ -193,6 +217,13 @@ static bool isMMThreeBitGPRegister(const MachineOperand &MO) {
return false;
}
+// Returns true if the machine operand MO is register $0, $17, or $2-$7.
+static bool isMMSourceRegister(const MachineOperand &MO) {
+ if (MO.isReg() && Mips::GPRMM16ZeroRegClass.contains(MO.getReg()))
+ return true;
+ return false;
+}
+
// Returns true if the operand Op is an immediate value
// and writes the immediate value into variable Imm
static bool GetImm(MachineInstr *MI, unsigned Op, int64_t &Imm) {
@@ -279,6 +310,32 @@ bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
return ReplaceInstruction(MI, Entry);
}
+bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(1)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
+bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (!isMMSourceRegister(MI->getOperand(0)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(1)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
bool Modified = false;
MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),
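The ImmField(...) tuples in the new table entries read as (Shift, LBound, HBound, ImmFieldOperand). A self-contained sketch of the range check they imply, assuming ImmInRange requires the immediate to be Shift-aligned with its scaled value inside [LBound, HBound); this is inferred from the table, not the pass's actual code:

  #include <cstdint>

  // E.g. LWSP's ImmField(2, 0, 32, 2) would admit offsets 0, 4, ..., 124,
  // and LBU16's ImmField(0, -1, 15, 2) admits -1 through 14.
  static bool immInRangeSketch(int64_t Imm, int Shift, int64_t LBound,
                               int64_t HBound) {
    if (Imm & ((int64_t(1) << Shift) - 1))
      return false;             // immediate not aligned to 1 << Shift
    int64_t Scaled = Imm >> Shift;
    return Scaled >= LBound && Scaled < HBound;
  }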
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 54619589c341..35a67134775a 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -88,6 +88,3 @@ pr45695.c wasm-o
pr49279.c wasm-o
pr49390.c wasm-o
pr52286.c wasm-o
-
-# fatal error: error in backend: data symbols must have a size set with .size
-921110-1.c wasm-o
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0a41f35f9320..5303d7a406ad 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4753,7 +4753,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
SmallVectorImpl<int> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
int NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
+ ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
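scaleShuffleMask widens each mask entry into Scale consecutive entries, replicating negative sentinels. A standalone behavioral sketch with std::vector standing in for the LLVM containers (illustration, not the function's verbatim body):

  #include <vector>

  // Scale = 2, {1, -1} -> {2, 3, -1, -1}: element M expands to
  // M*Scale+0 .. M*Scale+Scale-1; sentinels (M < 0) are copied through.
  std::vector<int> scaleMaskSketch(int Scale, const std::vector<int> &Mask) {
    std::vector<int> Scaled(Mask.size() * Scale, -1);
    for (size_t i = 0; i != Mask.size(); ++i)
      for (int s = 0; s != Scale; ++s)
        Scaled[i * Scale + s] = Mask[i] < 0 ? Mask[i] : Mask[i] * Scale + s;
    return Scaled;
  }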
@@ -5848,17 +5848,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
case ISD::SCALAR_TO_VECTOR: {
- // Match against a scalar_to_vector of an extract from a similar vector.
+    // Match against a scalar_to_vector of an extract from a vector;
+    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
SDValue N0 = N.getOperand(0);
- if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- N0.getOperand(0).getValueType() != VT ||
- !isa<ConstantSDNode>(N0.getOperand(1)) ||
- NumElts <= N0.getConstantOperandVal(1) ||
- !N->isOnlyUserOf(N0.getNode()))
+ SDValue SrcExtract;
+
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getOperand(0).getValueType() == VT) {
+ SrcExtract = N0;
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
+ }
+
+ if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)) ||
+ NumElts <= SrcExtract.getConstantOperandVal(1))
return false;
- Ops.push_back(N0.getOperand(0));
- Mask.push_back(N0.getConstantOperandVal(1));
- Mask.append(NumElts - 1, SM_SentinelUndef);
+
+ SDValue SrcVec = SrcExtract.getOperand(0);
+ EVT SrcVT = SrcVec.getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
+
+ Ops.push_back(SrcVec);
+ Mask.push_back(SrcExtract.getConstantOperandVal(1));
+ Mask.append(NumZeros, SM_SentinelZero);
+ Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
@@ -6542,12 +6564,12 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
- assert((ScalarSize == 32 || ScalarSize == 64) &&
- "Unsupported floating point scalar size");
- if (ScalarSize == 32)
- Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
- else
- Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+ if (ScalarSize == 32) {
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+ } else {
+ assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+ }
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
@@ -6633,11 +6655,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// AVX have support for 32 and 64 bit broadcast for floats only.
// No 64bit integer in 32bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- Constant *C = SplatBitSize == 32
- ? ConstantFP::get(Type::getFloatTy(*Ctx),
- SplatValue.bitsToFloat())
- : ConstantFP::get(Type::getDoubleTy(*Ctx),
- SplatValue.bitsToDouble());
+      // Lower the splat via APFloat directly, to avoid any host FP conversion.
+ Constant *C =
+ SplatBitSize == 32
+ ? ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEsingle(), SplatValue))
+ : ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
@@ -8003,7 +8027,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
@@ -16997,7 +17021,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
- ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
@@ -17024,18 +17048,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
// available.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC == 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
- if (SetCCOpcode == ISD::SETUEQ) {
+ if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
static_cast<unsigned>(ISD::OR);
} else {
- assert(SetCCOpcode == ISD::SETONE);
+ assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
@@ -17082,7 +17106,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// 2. The original operand type has been promoted to a 256-bit vector.
//
// Note that condition 2. only applies for AVX targets.
- SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
return DAG.getZExtOrTrunc(NewOp, dl, VT);
}
@@ -17122,7 +17146,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
- switch (SetCCOpcode) {
+ switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
@@ -17137,60 +17161,49 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
// Are we comparing unsigned or signed integers?
- unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
- ? X86ISD::VPCOMU : X86ISD::VPCOM;
+ unsigned Opc =
+ ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
- // We are handling one of the integer comparisons here. Since SSE only has
+ // We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc;
- bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
- bool Subus = false;
-
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
- case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
- case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETLE: Opc = X86ISD::PCMPGT;
- Invert = true; break;
- case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETUGT: Opc = X86ISD::PCMPGT;
- FlipSigns = true; break;
- case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
- case ISD::SETULE: Opc = X86ISD::PCMPGT;
- FlipSigns = true; Invert = true; break;
- }
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+ bool FlipSigns = ISD::isUnsignedIntSetCC(Cond);
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
- bool hasMinMax =
- (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
- || (Subtarget.hasSSE2() && (VET == MVT::i8));
-
- if (hasMinMax) {
- switch (SetCCOpcode) {
+ bool HasMinMax =
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
+ (Subtarget.hasSSE2() && (VET == MVT::i8));
+ bool MinMax = false;
+ if (HasMinMax) {
+ switch (Cond) {
default: break;
case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
- if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ if (MinMax)
+ Swap = Invert = FlipSigns = false;
}
- bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
- if (!MinMax && hasSubus) {
+ bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ bool Subus = false;
+ if (!MinMax && HasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
// t = psubus Op0, Op1
// pcmpeq t, <0..0>
- switch (SetCCOpcode) {
+ switch (Cond) {
default: break;
case ISD::SETULT: {
// If the comparison is against a constant we can turn this into a
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index 613b4a7f03e9..626a891f65c6 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -228,7 +228,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
SmallVector<ReturnInst *, 4> Returns;
- CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/false, Returns);
+ CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
// Remove old returns.
for (ReturnInst *Return : Returns)
diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp
index ea48043f9381..44e1f9b404ed 100644
--- a/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/lib/Transforms/Coroutines/Coroutines.cpp
@@ -218,6 +218,8 @@ void coro::Shape::buildFrom(Function &F) {
size_t FinalSuspendIndex = 0;
clear(*this);
SmallVector<CoroFrameInst *, 8> CoroFrames;
+ SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
+
for (Instruction &I : instructions(F)) {
if (auto II = dyn_cast<IntrinsicInst>(&I)) {
switch (II->getIntrinsicID()) {
@@ -229,6 +231,12 @@ void coro::Shape::buildFrom(Function &F) {
case Intrinsic::coro_frame:
CoroFrames.push_back(cast<CoroFrameInst>(II));
break;
+ case Intrinsic::coro_save:
+      // After optimizations, coro_suspends using this coro_save might have
+      // been removed; remember orphaned coro_saves so they can be removed later.
+ if (II->use_empty())
+ UnusedCoroSaves.push_back(cast<CoroSaveInst>(II));
+ break;
case Intrinsic::coro_suspend:
CoroSuspends.push_back(cast<CoroSuspendInst>(II));
if (CoroSuspends.back()->isFinal()) {
@@ -311,4 +319,8 @@ void coro::Shape::buildFrom(Function &F) {
if (HasFinalSuspend &&
FinalSuspendIndex != CoroSuspends.size() - 1)
std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
+
+ // Remove orphaned coro.saves.
+ for (CoroSaveInst *CoroSave : UnusedCoroSaves)
+ CoroSave->eraseFromParent();
}
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 28cc81c76d4f..5cc29a493798 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1188,6 +1188,10 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
SCCNodes.insert(F);
}
+ // Skip it if the SCC only contains optnone functions.
+ if (SCCNodes.empty())
+ return Changed;
+
Changed |= addArgumentReturnedAttrs(SCCNodes);
Changed |= addReadAttrs(SCCNodes, AARGetter);
Changed |= addArgumentAttrs(SCCNodes);
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 231487923fad..6d34ab8b0d96 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -292,8 +292,7 @@ static void computeImportForFunction(
static void ComputeImportForModule(
const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr,
- const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) {
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
   // Worklist contains the list of functions imported into this module, for which
// we will analyse the callees and may import further down the callgraph.
SmallVector<EdgeInfo, 128> Worklist;
@@ -301,7 +300,7 @@ static void ComputeImportForModule(
// Populate the worklist with the import for the functions in the current
// module
for (auto &GVSummary : DefinedGVSummaries) {
- if (DeadSymbols && DeadSymbols->count(GVSummary.first)) {
+ if (!Index.isGlobalValueLive(GVSummary.second)) {
DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
continue;
}
@@ -344,15 +343,14 @@ void llvm::ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists,
- const DenseSet<GlobalValue::GUID> *DeadSymbols) {
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
// For each module that has function defined, compute the import/export lists.
for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
auto &ImportList = ImportLists[DefinedGVSummaries.first()];
DEBUG(dbgs() << "Computing import for Module '"
<< DefinedGVSummaries.first() << "'\n");
ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
- &ExportLists, DeadSymbols);
+ &ExportLists);
}
// When computing imports we added all GUIDs referenced by anything
@@ -414,82 +412,71 @@ void llvm::ComputeCrossModuleImportForModule(
#endif
}
-DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols(
- const ModuleSummaryIndex &Index,
+void llvm::computeDeadSymbols(
+ ModuleSummaryIndex &Index,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ assert(!Index.withGlobalValueDeadStripping());
if (!ComputeDead)
- return DenseSet<GlobalValue::GUID>();
+ return;
if (GUIDPreservedSymbols.empty())
     // Don't do anything when nothing is live; this is friendly with tests.
- return DenseSet<GlobalValue::GUID>();
- DenseSet<ValueInfo> LiveSymbols;
+ return;
+ unsigned LiveSymbols = 0;
SmallVector<ValueInfo, 128> Worklist;
Worklist.reserve(GUIDPreservedSymbols.size() * 2);
for (auto GUID : GUIDPreservedSymbols) {
ValueInfo VI = Index.getValueInfo(GUID);
if (!VI)
continue;
- DEBUG(dbgs() << "Live root: " << VI.getGUID() << "\n");
- LiveSymbols.insert(VI);
- Worklist.push_back(VI);
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
}
+
// Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index) {
- bool IsLiveRoot = llvm::any_of(
- Entry.second.SummaryList,
- [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) {
- return Summary->liveRoot();
- });
- if (!IsLiveRoot)
- continue;
- DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n");
- Worklist.push_back(ValueInfo(&Entry));
- }
+ for (const auto &Entry : Index)
+ for (auto &S : Entry.second.SummaryList)
+ if (S->isLive()) {
+ DEBUG(dbgs() << "Live root: " << Entry.first << "\n");
+ Worklist.push_back(ValueInfo(&Entry));
+ ++LiveSymbols;
+ break;
+ }
+
+ // Make value live and add it to the worklist if it was not live before.
+ // FIXME: we should only make the prevailing copy live here
+ auto visit = [&](ValueInfo VI) {
+ for (auto &S : VI.getSummaryList())
+ if (S->isLive())
+ return;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ ++LiveSymbols;
+ Worklist.push_back(VI);
+ };
while (!Worklist.empty()) {
auto VI = Worklist.pop_back_val();
-
- // FIXME: we should only make the prevailing copy live here
for (auto &Summary : VI.getSummaryList()) {
- for (auto Ref : Summary->refs()) {
- if (LiveSymbols.insert(Ref).second) {
- DEBUG(dbgs() << "Marking live (ref): " << Ref.getGUID() << "\n");
- Worklist.push_back(Ref);
- }
- }
- if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) {
- for (auto Call : FS->calls()) {
- if (LiveSymbols.insert(Call.first).second) {
- DEBUG(dbgs() << "Marking live (call): " << Call.first.getGUID()
- << "\n");
- Worklist.push_back(Call.first);
- }
- }
- }
+ for (auto Ref : Summary->refs())
+ visit(Ref);
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+ for (auto Call : FS->calls())
+ visit(Call.first);
if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
auto AliaseeGUID = AS->getAliasee().getOriginalName();
ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID);
- if (AliaseeVI && LiveSymbols.insert(AliaseeVI).second) {
- DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n");
- Worklist.push_back(AliaseeVI);
- }
+ if (AliaseeVI)
+ visit(AliaseeVI);
}
}
}
- DenseSet<GlobalValue::GUID> DeadSymbols;
- DeadSymbols.reserve(
- std::min(Index.size(), Index.size() - LiveSymbols.size()));
- for (auto &Entry : Index) {
- if (!LiveSymbols.count(ValueInfo(&Entry))) {
- DEBUG(dbgs() << "Marking dead: " << Entry.first << "\n");
- DeadSymbols.insert(Entry.first);
- }
- }
- DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and "
- << DeadSymbols.size() << " symbols Dead \n");
- NumDeadSymbols += DeadSymbols.size();
- NumLiveSymbols += LiveSymbols.size();
- return DeadSymbols;
+ Index.setWithGlobalValueDeadStripping();
+
+ unsigned DeadSymbols = Index.size() - LiveSymbols;
+ DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols;
+ NumLiveSymbols += LiveSymbols;
}
/// Compute the set of summaries needed for a ThinLTO backend compilation of
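The rewritten computeDeadSymbols is a plain mark phase over the summary graph: seed the worklist with preserved roots and summary-flagged roots, propagate liveness through refs, calls, and aliasees, and count what gets marked. A generic standalone sketch of that shape; Node and its Live flag are invented for illustration:

  #include <functional>
  #include <vector>

  struct Node { bool Live = false; };

  // Everything reachable from the roots ends up flagged live; nodes never
  // visited stay dead, matching DeadSymbols = Index.size() - LiveSymbols.
  unsigned markLive(std::vector<Node *> Worklist,
                    const std::function<void(Node *, std::vector<Node *> &)>
                        &PushSuccessors) {
    unsigned Live = 0;
    while (!Worklist.empty()) {
      Node *N = Worklist.back();
      Worklist.pop_back();
      if (N->Live)
        continue;                  // mirrors visit()'s early return
      N->Live = true;
      ++Live;
      PushSuccessors(N, Worklist); // refs(), calls(), and the aliasee
    }
    return Live;
  }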
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index ca4ee92f971a..7bec50d9d25f 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1442,9 +1442,8 @@ bool LowerTypeTestsModule::lower() {
for (auto &P : *ExportSummary) {
for (auto &S : P.second.SummaryList) {
auto *FS = dyn_cast<FunctionSummary>(S.get());
- if (!FS)
+ if (!FS || !ExportSummary->isGlobalValueLive(FS))
continue;
- // FIXME: Only add live functions.
for (GlobalValue::GUID G : FS->type_tests())
for (Metadata *MD : MetadataByGUID[G])
AddTypeIdUse(MD).IsExported = true;
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index bc0967448cdd..ea805efc66b7 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -68,6 +68,10 @@ static cl::opt<int>
cl::desc("Relative frequency of outline region to "
"the entry block"));
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
+
namespace {
struct FunctionOutliningInfo {
@@ -83,7 +87,7 @@ struct FunctionOutliningInfo {
SmallVector<BasicBlock *, 4> Entries;
// The return block that is not included in the outlined region.
BasicBlock *ReturnBlock;
- // The dominating block of the region ot be outlined.
+ // The dominating block of the region to be outlined.
BasicBlock *NonReturnBlock;
   // The set of blocks in Entries that are predecessors to ReturnBlock
SmallVector<BasicBlock *, 4> ReturnBlockPreds;
@@ -407,11 +411,23 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
if (hasProfileData(F, OI))
return OutlineRegionRelFreq;
- // When profile data is not available, we need to be very
- // conservative in estimating the overall savings. We need to make sure
- // the outline region relative frequency is not below the threshold
- // specified by the option.
- OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+ // When profile data is not available, we need to be conservative in
+ // estimating the overall savings. Static branch prediction can usually
+ // guess the branch direction right (taken/non-taken), but the guessed
+  // branch probability is usually not biased enough. When the outlined
+  // region is predicted to be likely, its probability needs to be made
+  // higher (more biased) so as not to under-estimate the cost of
+  // function outlining. On the other hand, if the outlined region is
+  // predicted to be less likely, the predicted probability is usually
+  // higher than the actual one. For instance, the actual probability of
+  // the less likely target may be only 5%, while the guessed probability
+  // can be 40%. In the latter case, there is no need for further
+  // adjustment.
+ // FIXME: add an option for this.
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
return OutlineRegionRelFreq;
}
@@ -496,6 +512,26 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
if (isa<DbgInfoIntrinsic>(I))
continue;
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ continue;
+ default:
+ break;
+ }
+
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+ }
+
if (CallInst *CI = dyn_cast<CallInst>(I)) {
InlineCost += getCallsiteCost(CallSite(CI), DL);
continue;
@@ -519,7 +555,13 @@ std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
BasicBlock *OutliningCallBB) {
// First compute the cost of the outlined region 'OI' in the original
- // function 'F':
+ // function 'F'.
+ // FIXME: The code extractor (outliner) can now do code sinking/hoisting
+  // to reduce outlining cost. The hoisted/sunk code currently does not
+  // incur any runtime cost, so it is still OK to compare the outlined
+  // function cost with the outlined region in the original function.
+  // If this ever changes, we will need to introduce a new extractor API
+  // to pass the information.
int OutlinedRegionCost = 0;
for (BasicBlock &BB : *F) {
if (&BB != OI->ReturnBlock &&
@@ -542,8 +584,14 @@ std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
assert(OutlinedFunctionCost >= OutlinedRegionCost &&
"Outlined function cost should be no less than the outlined region");
- int OutliningRuntimeOverhead =
- OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+  // The code extractor introduces new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
+
+ int OutliningRuntimeOverhead = OutliningFuncCallCost +
+ (OutlinedFunctionCost - OutlinedRegionCost) +
+ ExtraOutliningPenalty;
return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
OutlinedRegionCost);
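A worked example of the adjusted overhead computation, with made-up costs and InlineConstants::InstrCost taken to be its usual value of 5:

  // Hypothetical numbers only, to show where the 2 * InstrCost credit goes.
  const int InstrCost = 5;           // stand-in for InlineConstants::InstrCost
  int OutliningFuncCallCost = 25;    // cost of the call to the outlined func
  int OutlinedFunctionCost = 60;     // cost of the outlined function body
  int OutlinedRegionCost = 40;       // cost of the region in the original
  int ExtraOutliningPenalty = 0;     // the new debug knob, defaults to 0
  OutlinedFunctionCost -= 2 * InstrCost;  // root/exit stub branches: 60 -> 50
  int OutliningRuntimeOverhead =
      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost) +
      ExtraOutliningPenalty;         // 25 + (50 - 40) + 0 == 35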
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 9fd3a9021a27..16fba32e9805 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -157,7 +157,7 @@ static cl::opt<bool>
static cl::opt<bool> EnableGVNSink(
"enable-gvn-sink", cl::init(false), cl::Hidden,
- cl::desc("Enable the GVN sinking pass (default = on)"));
+ cl::desc("Enable the GVN sinking pass (default = off)"));
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2c2b7317a1c0..c0798e164c39 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4508,13 +4508,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Builder->CreateAnd(A, B),
Op1);
- // ~x < ~y --> y < x
- // ~x < cst --> ~cst < x
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
if (match(Op0, m_Not(m_Value(A)))) {
if (match(Op1, m_Not(m_Value(B))))
return new ICmpInst(I.getPredicate(), B, A);
- if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
- return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A);
+
+ const APInt *C;
+ if (match(Op1, m_APInt(C)))
+ return new ICmpInst(I.getSwappedPredicate(), A,
+ ConstantInt::get(Op1->getType(), ~(*C)));
}
Instruction *AddI = nullptr;
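The fold rests on bitwise-not reversing integer order (~x == -x - 1). A standalone brute-force check of both rewritten identities over all 8-bit signed values; a verification sketch, not part of the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int x = -128; x < 128; ++x) {
      for (int y = -128; y < 128; ++y) {
        int8_t X = (int8_t)x, Y = (int8_t)y;
        // ~X < ~Y  -->  Y < X
        assert(((int8_t)~X < (int8_t)~Y) == (Y < X));
        // ~X < C  -->  X > ~C, with Y standing in for the constant C
        assert(((int8_t)~X < Y) == (X > (int8_t)~Y));
      }
    }
    return 0;
  }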
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ff753c20a94a..df4ee9969c02 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2087,6 +2087,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
switch (I.getNumArgOperands()) {
case 3:
assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode");
+ LLVM_FALLTHROUGH;
case 2:
CopyOp = I.getArgOperand(0);
ConvertOp = I.getArgOperand(1);
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 325b64cd8b43..8aa40d1759de 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -57,6 +57,11 @@ static const char *const SanCovTracePCGuardName =
"__sanitizer_cov_trace_pc_guard";
static const char *const SanCovTracePCGuardInitName =
"__sanitizer_cov_trace_pc_guard_init";
+static const char *const SanCov8bitCountersInitName =
+ "__sanitizer_cov_8bit_counters_init";
+
+static const char *const SanCovGuardsSectionName = "sancov_guards";
+static const char *const SanCovCountersSectionName = "sancov_counters";
static cl::opt<int> ClCoverageLevel(
"sanitizer-coverage-level",
@@ -64,14 +69,18 @@ static cl::opt<int> ClCoverageLevel(
"3: all blocks and critical edges"),
cl::Hidden, cl::init(0));
-static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc",
- cl::desc("Experimental pc tracing"),
- cl::Hidden, cl::init(false));
+static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"), cl::Hidden,
+ cl::init(false));
static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
cl::desc("pc tracing with a guard"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
+ cl::desc("increments 8-bit counter for every edge"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool>
ClCMPTracing("sanitizer-coverage-trace-compares",
cl::desc("Tracing of CMP and similar instructions"),
@@ -123,9 +132,10 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.TraceCmp |= ClCMPTracing;
Options.TraceDiv |= ClDIVTracing;
Options.TraceGep |= ClGEPTracing;
- Options.TracePC |= ClExperimentalTracePC;
+ Options.TracePC |= ClTracePC;
Options.TracePCGuard |= ClTracePCGuard;
- if (!Options.TracePCGuard && !Options.TracePC)
+ Options.Inline8bitCounters |= ClInline8bitCounters;
+ if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters)
Options.TracePCGuard = true; // TracePCGuard is default.
Options.NoPrune |= !ClPruneBlocks;
return Options;
@@ -159,11 +169,22 @@ private:
void InjectTraceForSwitch(Function &F,
ArrayRef<Instruction *> SwitchTraceTargets);
bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void CreateFunctionGuardArray(size_t NumGuards, Function &F);
+ GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
+ Function &F, Type *Ty,
+ const char *Section);
+ void CreateFunctionLocalArrays(size_t NumGuards, Function &F);
void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx);
- StringRef getSanCovTracePCGuardSection() const;
- StringRef getSanCovTracePCGuardSectionStart() const;
- StringRef getSanCovTracePCGuardSectionEnd() const;
+ void CreateInitCallForSection(Module &M, const char *InitFunctionName,
+ Type *Ty, const std::string &Section);
+
+ void SetNoSanitizeMetadata(Instruction *I) {
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
+ }
+
+ std::string getSectionName(const std::string &Section) const;
+ std::string getSectionStart(const std::string &Section) const;
+ std::string getSectionEnd(const std::string &Section) const;
Function *SanCovTracePCIndir;
Function *SanCovTracePC, *SanCovTracePCGuard;
Function *SanCovTraceCmpFunction[4];
@@ -171,20 +192,48 @@ private:
Function *SanCovTraceGepFunction;
Function *SanCovTraceSwitchFunction;
InlineAsm *EmptyAsm;
- Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
+ *Int8Ty, *Int8PtrTy;
Module *CurModule;
Triple TargetTriple;
LLVMContext *C;
const DataLayout *DL;
GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
- bool HasSancovGuardsSection;
+ GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
SanitizerCoverageOptions Options;
};
} // namespace
+void SanitizerCoverageModule::CreateInitCallForSection(
+ Module &M, const char *InitFunctionName, Type *Ty,
+ const std::string &Section) {
+ IRBuilder<> IRB(M.getContext());
+ Function *CtorFunc;
+ GlobalVariable *SecStart =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr,
+ getSectionStart(Section));
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ GlobalVariable *SecEnd =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, getSectionEnd(Section));
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty},
+ {IRB.CreatePointerCast(SecStart, Ty), IRB.CreatePointerCast(SecEnd, Ty)});
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+}
+
bool SanitizerCoverageModule::runOnModule(Module &M) {
if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
return false;
@@ -192,15 +241,18 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
DL = &M.getDataLayout();
CurModule = &M;
TargetTriple = Triple(M.getTargetTriple());
- HasSancovGuardsSection = false;
+ FunctionGuardArray = nullptr;
+ Function8bitCounterArray = nullptr;
IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
IntptrPtrTy = PointerType::getUnqual(IntptrTy);
Type *VoidTy = Type::getVoidTy(*C);
IRBuilder<> IRB(*C);
Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
Int64Ty = IRB.getInt64Ty();
Int32Ty = IRB.getInt32Ty();
+ Int8Ty = IRB.getInt8Ty();
SanCovTracePCIndir = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy));
@@ -243,34 +295,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
for (auto &F : M)
runOnFunction(F);
- // Create variable for module (compilation unit) name
- if (Options.TracePCGuard) {
- if (HasSancovGuardsSection) {
- Function *CtorFunc;
- GlobalVariable *SecStart = new GlobalVariable(
- M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
- getSanCovTracePCGuardSectionStart());
- SecStart->setVisibility(GlobalValue::HiddenVisibility);
- GlobalVariable *SecEnd = new GlobalVariable(
- M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
- getSanCovTracePCGuardSectionEnd());
- SecEnd->setVisibility(GlobalValue::HiddenVisibility);
-
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, SanCovModuleCtorName, SanCovTracePCGuardInitName,
- {Int32PtrTy, Int32PtrTy},
- {IRB.CreatePointerCast(SecStart, Int32PtrTy),
- IRB.CreatePointerCast(SecEnd, Int32PtrTy)});
-
- if (TargetTriple.supportsCOMDAT()) {
- // Use comdat to dedup CtorFunc.
- CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
- } else {
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
- }
- }
- }
+ if (FunctionGuardArray)
+ CreateInitCallForSection(M, SanCovTracePCGuardInitName, Int32PtrTy,
+ SanCovGuardsSectionName);
+ if (Function8bitCounterArray)
+ CreateInitCallForSection(M, SanCov8bitCountersInitName, Int8PtrTy,
+ SanCovCountersSectionName);
+
return true;
}
@@ -393,17 +424,26 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
InjectTraceForGep(F, GepTraceTargets);
return true;
}
-void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards,
- Function &F) {
- if (!Options.TracePCGuard) return;
- HasSancovGuardsSection = true;
- ArrayType *ArrayOfInt32Ty = ArrayType::get(Int32Ty, NumGuards);
- FunctionGuardArray = new GlobalVariable(
- *CurModule, ArrayOfInt32Ty, false, GlobalVariable::PrivateLinkage,
- Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_");
+
+GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
+ size_t NumElements, Function &F, Type *Ty, const char *Section) {
+ ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
+ auto Array = new GlobalVariable(
+ *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayTy), "__sancov_gen_");
if (auto Comdat = F.getComdat())
- FunctionGuardArray->setComdat(Comdat);
- FunctionGuardArray->setSection(getSanCovTracePCGuardSection());
+ Array->setComdat(Comdat);
+ Array->setSection(getSectionName(Section));
+ return Array;
+}
+void SanitizerCoverageModule::CreateFunctionLocalArrays(size_t NumGuards,
+ Function &F) {
+ if (Options.TracePCGuard)
+ FunctionGuardArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int32Ty, SanCovGuardsSectionName);
+ if (Options.Inline8bitCounters)
+ Function8bitCounterArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int8Ty, SanCovCountersSectionName);
}
bool SanitizerCoverageModule::InjectCoverage(Function &F,
@@ -413,11 +453,11 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F,
case SanitizerCoverageOptions::SCK_None:
return false;
case SanitizerCoverageOptions::SCK_Function:
- CreateFunctionGuardArray(1, F);
+ CreateFunctionLocalArrays(1, F);
InjectCoverageAtBlock(F, F.getEntryBlock(), 0);
return true;
default: {
- CreateFunctionGuardArray(AllBlocks.size(), F);
+ CreateFunctionLocalArrays(AllBlocks.size(), F);
for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
InjectCoverageAtBlock(F, *AllBlocks[i], i);
return true;
@@ -436,7 +476,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
Function &F, ArrayRef<Instruction *> IndirCalls) {
if (IndirCalls.empty())
return;
- assert(Options.TracePC || Options.TracePCGuard);
+ assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters);
for (auto I : IndirCalls) {
IRBuilder<> IRB(I);
CallSite CS(I);
@@ -564,8 +604,8 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
if (Options.TracePC) {
IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
- } else {
- assert(Options.TracePCGuard);
+ }
+ if (Options.TracePCGuard) {
auto GuardPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
ConstantInt::get(IntptrTy, Idx * 4)),
@@ -573,26 +613,39 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
}
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
}
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSection() const {
+std::string
+SanitizerCoverageModule::getSectionName(const std::string &Section) const {
if (TargetTriple.getObjectFormat() == Triple::COFF)
return ".SCOV$M";
if (TargetTriple.isOSBinFormatMachO())
- return "__DATA,__sancov_guards";
- return "__sancov_guards";
+ return "__DATA,__" + Section;
+ return "__" + Section;
}
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionStart() const {
+std::string
+SanitizerCoverageModule::getSectionStart(const std::string &Section) const {
if (TargetTriple.isOSBinFormatMachO())
- return "\1section$start$__DATA$__sancov_guards";
- return "__start___sancov_guards";
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
}
-StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionEnd() const {
+std::string
+SanitizerCoverageModule::getSectionEnd(const std::string &Section) const {
if (TargetTriple.isOSBinFormatMachO())
- return "\1section$end$__DATA$__sancov_guards";
- return "__stop___sancov_guards";
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
}
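In source-level terms, the new inline-8bit-counters mode gives every instrumented block roughly the effect below; names are illustrative, and the real pass emits the load/increment/store as IR with nosanitize metadata in InjectCoverageAtBlock above:

  #include <cstdint>

  // One byte per basic block, placed in the sancov_counters section; the
  // generated module ctor hands the [start, stop) range to
  // __sanitizer_cov_8bit_counters_init at startup.
  void coverageHitSketch(uint8_t *FunctionCounters, unsigned BlockIdx) {
    FunctionCounters[BlockIdx]++;  // 8-bit counter, deliberately wraps
  }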
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 3953198fe605..9a7882211bac 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1823,6 +1823,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
// An IV counter must preserve its type.
if (IncI->getNumOperands() == 2)
break;
+ LLVM_FALLTHROUGH;
default:
return nullptr;
}
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 930696b036c0..7d8da9b453f9 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -83,6 +84,149 @@ static bool handleSwitchExpect(SwitchInst &SI) {
return true;
}
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If an operand of the phi has a constant value that 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = cast<ConstantInt>(Expect->getArgOperand(1));
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+  // Walk backward up a list of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+  // Get the first dominating conditional branch of operand i's
+  // incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+  // Now walk through all Phi operands to find phi operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+    // Not an interesting case -- we cannot infer anything useful when the
+    // operand value matches the expected phi output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+    // Returns true if the operand coming from OpndIncomingBB
+    // comes from the outgoing edge of BI that leads to the Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
// Handle both BranchInst and SelectInst.
template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
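A C-level illustration of the pattern handlePhiDef targets; the source and the names a, compute, and rare_path are invented:

  extern int a;
  int compute();
  void rare_path();

  void example() {
    int v;
    if (a)
      v = 1;          // constant incoming value: contradicts the expected 0
    else
      v = compute();  // non-constant incoming value: nothing to infer here
    if (__builtin_expect(v, 0))
      rare_path();    // so the 'if (a)' branch gets unlikely/likely weights
  }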
@@ -99,25 +243,31 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
CmpInst::Predicate Predicate;
- uint64_t ValueComparedTo = 0;
+ ConstantInt *CmpConstOperand = nullptr;
if (!CmpI) {
CI = dyn_cast<CallInst>(BSI.getCondition());
Predicate = CmpInst::ICMP_NE;
- ValueComparedTo = 0;
} else {
Predicate = CmpI->getPredicate();
if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
return false;
- ConstantInt *CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
if (!CmpConstOperand)
return false;
- ValueComparedTo = CmpConstOperand->getZExtValue();
CI = dyn_cast<CallInst>(CmpI->getOperand(0));
}
if (!CI)
return false;
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
Function *Fn = CI->getCalledFunction();
if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
return false;
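
The reordered logic above also adds a guard before getZExtValue(): that call asserts when the value has more than 64 active bits, so the pass now conservatively bails out on any compare constant wider than 64 bits (say an i128 operand). A minimal sketch of the same defensive pattern, with a hypothetical helper name:

  #include "llvm/IR/Constants.h"
  #include <cstdint>

  // Hypothetical helper mirroring the guard above: extract the value only
  // when the constant's type is at most 64 bits wide; wider types are
  // conservatively rejected instead of risking the assertion.
  static bool getIfFitsU64(const llvm::ConstantInt *CI, uint64_t &Out) {
    if (CI->getBitWidth() > 64)
      return false;
    Out = CI->getZExtValue();
    return true;
  }
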
@@ -181,6 +331,10 @@ static bool lowerExpectIntrinsic(Function &F) {
Function *Fn = CI->getCalledFunction();
if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+      // Before erasing the llvm.expect, walk backward to find the
+      // phi that defines llvm.expect's first argument, and infer the
+      // branch probability:
+ handlePhiDef(CI);
Value *Exp = CI->getArgOperand(0);
CI->replaceAllUsesWith(Exp);
CI->eraseFromParent();
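
For context, here is a hedged C++ illustration (not taken from the patch's tests) of the pattern handlePhiDef targets: the value handed to __builtin_expect is defined by a phi of per-edge constants, so the expectation can be back-propagated to the branch that selects the conflicting constant.

  // After IR lowering, `t` becomes a phi of 0 (fall-through edge) and
  // 1 (taken edge). Since __builtin_expect(t, 1) says t is expected to
  // be 1, the edge producing the conflicting constant 0 is marked
  // unlikely, which annotates the earlier `if (x > 0)` branch.
  int classify(int x) {
    int t = 0;
    if (x > 0)
      t = 1;
    if (__builtin_expect(t, 1))
      return 1;
    return 0;
  }
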
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 77b2bd84f9b6..350b50ffcdd4 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// Rewrite an existing set of gc.statepoints such that they make potential
-// relocations performed by the garbage collector explicit in the IR.
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
//
//===----------------------------------------------------------------------===//
@@ -2094,9 +2094,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// live in the IR. We'll remove all of these when done.
SmallVector<CallInst *, 64> Holders;
- // Insert a dummy call with all of the arguments to the vm_state we'll need
- // for the actual safepoint insertion. This ensures reference arguments in
- // the deopt argument list are considered live through the safepoint (and
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
// thus makes sure they get relocated.)
for (CallSite CS : ToUpdate) {
SmallVector<Value *, 64> DeoptValues;
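
The rewritten comment describes a liveness trick: a throwaway call that receives every deopt operand as an argument, keeping those operands live until the actual safepoints are in place. A hedged sketch of the shape of that trick (the holder's name and insertion point are hypothetical; the pass creates and erases its own holders):

  // A varargs declaration whose only job is to reference DeoptValues so
  // they are considered live across safepoint insertion.
  llvm::FunctionType *HolderTy =
      llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), /*isVarArg=*/true);
  llvm::Function *Holder = llvm::Function::Create(
      HolderTy, llvm::GlobalValue::ExternalLinkage, "deopt.holder", &M);
  Holders.push_back(llvm::CallInst::Create(Holder, DeoptValues, "", InsertPt));
  // Once all parse points are rewritten, each holder call is erased again.
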
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6e113bccff94..fb1b5813fd79 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -3698,7 +3698,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
// Either lookup a split load or create one.
LoadInst *PLoad;
@@ -3709,7 +3710,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, LoadBasePtr->getName() + "."),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
}
@@ -3719,7 +3720,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
StoreInst *PStore = IRB.CreateAlignedStore(
PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
// Now build a new slice for the alloca.
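
The split above matters when the presplit load and store sit in different address spaces: deriving both part-pointer types from SI meant the split load was given a pointer in the store's address space. Restated compactly, using the same names the hunk uses:

  // One part type, two pointer types: the load part must use LI's address
  // space and the store part SI's, instead of sharing a single PartPtrTy.
  llvm::Type *PartTy = llvm::Type::getIntNTy(Ty->getContext(), PartSize * 8);
  auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
  auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
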
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 1ec3d0d49637..1c1a75c111e9 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -37,10 +37,10 @@
using namespace llvm;
/// See comments in Cloning.h.
-BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
- ValueToValueMapTy &VMap,
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
- ClonedCodeInfo *CodeInfo) {
+ ClonedCodeInfo *CodeInfo,
+ DebugInfoFinder *DIFinder) {
DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
@@ -50,10 +50,11 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
// Loop over all instructions, and copy them over.
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
+
+ if (DIFinder && F->getParent() && II->getDebugLoc())
+ DIFinder->processLocation(*F->getParent(), II->getDebugLoc().get());
+
Instruction *NewInst = II->clone();
- if (F && F->getSubprogram())
- DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
- F->getSubprogram(), Cache);
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
NewBB->getInstList().push_back(NewInst);
@@ -122,31 +123,38 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
OldAttrs.getRetAttributes(), NewArgAttrs));
+ bool MustCloneSP =
+ OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+ DISubprogram *SP = OldFunc->getSubprogram();
+ if (SP) {
+ assert(!MustCloneSP || ModuleLevelChanges);
+ // Add mappings for some DebugInfo nodes that we don't want duplicated
+ // even if they're distinct.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ // If we're not cloning into the same module, no need to clone the
+    // subprogram.
+ if (!MustCloneSP)
+ MD[SP].reset(SP);
+ }
+
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
for (auto MD : MDs) {
- MDNode *NewMD;
- bool MustCloneSP =
- (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
- OldFunc->getParent() == NewFunc->getParent());
- if (MustCloneSP) {
- auto *SP = cast<DISubprogram>(MD.second);
- NewMD = DISubprogram::getDistinct(
- NewFunc->getContext(), SP->getScope(), SP->getName(),
- SP->getLinkageName(), SP->getFile(), SP->getLine(), SP->getType(),
- SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
- SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
- SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
- SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
- SP->getVariables(), SP->getThrownTypes());
- } else
- NewMD =
- MapMetadata(MD.second, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer);
- NewFunc->addMetadata(MD.first, *NewMD);
+ NewFunc->addMetadata(
+ MD.first,
+ *MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
}
+ // When we remap instructions, we want to avoid duplicating inlined
+ // DISubprograms, so record all subprograms we find as we duplicate
+ // instructions and then freeze them in the MD map.
+ DebugInfoFinder DIFinder;
+
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
// recursive functions into themselves.
@@ -156,7 +164,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
const BasicBlock &BB = *BI;
// Create a new basic block and copy instructions into it!
- BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo);
+ BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
+ SP ? &DIFinder : nullptr);
// Add basic block mapping.
VMap[&BB] = CBB;
@@ -178,6 +187,12 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
Returns.push_back(RI);
}
+ for (DISubprogram *ISP : DIFinder.subprograms()) {
+ if (ISP != SP) {
+ VMap.MD()[ISP].reset(ISP);
+ }
+ }
+
// Loop over all of the instructions in the function, fixing up operand
// references as we go. This uses VMap to do all the hard work.
for (Function::iterator BB =
@@ -226,7 +241,7 @@ Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
}
SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "",
+ CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
CodeInfo);
return NewF;
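
Taken together, the Cloning changes replace the ad-hoc DISubprogram duplication with a DebugInfoFinder sweep over the cloned instructions. A hedged usage sketch of the new optional parameter, based only on the signatures visible in this patch (the ".clone" suffix is illustrative):

  // Collect subprograms referenced by debug locations while cloning, then
  // freeze every inlined subprogram in the metadata map so MapMetadata
  // does not duplicate it.
  llvm::DebugInfoFinder DIFinder;
  llvm::BasicBlock *CBB = llvm::CloneBasicBlock(
      &BB, VMap, ".clone", NewFunc, /*CodeInfo=*/nullptr, &DIFinder);
  for (llvm::DISubprogram *ISP : DIFinder.subprograms())
    if (ISP != NewFunc->getSubprogram())
      VMap.MD()[ISP].reset(ISP);
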
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8b9a64c220cc..799eef21dc4e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4779,6 +4779,7 @@ void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
scalarizeInstruction(&I, true);
break;
}
+ LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -7396,6 +7397,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// likely.
return Cost / getReciprocalPredBlockProb();
}
+ LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e6f78e6b94a3..d1349535f298 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -259,6 +259,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
+ LLVM_FALLTHROUGH;
}
default:
return false;
@@ -4749,56 +4750,18 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
-namespace {
-/// Tracks instructons and its children.
-class WeakTrackingVHWithLevel final : public CallbackVH {
- /// Operand index of the instruction currently beeing analized.
- unsigned Level = 0;
- /// Is this the instruction that should be vectorized, or are we now
- /// processing children (i.e. operands of this instruction) for potential
- /// vectorization?
- bool IsInitial = true;
-
-public:
- explicit WeakTrackingVHWithLevel() = default;
- WeakTrackingVHWithLevel(Value *V) : CallbackVH(V){};
- /// Restart children analysis each time it is repaced by the new instruction.
- void allUsesReplacedWith(Value *New) override {
- setValPtr(New);
- Level = 0;
- IsInitial = true;
- }
- /// Check if the instruction was not deleted during vectorization.
- bool isValid() const { return !getValPtr(); }
- /// Is the istruction itself must be vectorized?
- bool isInitial() const { return IsInitial; }
- /// Try to vectorize children.
- void clearInitial() { IsInitial = false; }
- /// Are all children processed already?
- bool isFinal() const {
- assert(getValPtr() &&
- (isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() >= Level));
- return getValPtr() &&
- cast<Instruction>(getValPtr())->getNumOperands() == Level;
- }
- /// Get next child operation.
- Value *nextOperand() {
- assert(getValPtr() && isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() > Level);
- return cast<Instruction>(getValPtr())->getOperand(Level++);
- }
- virtual ~WeakTrackingVHWithLevel() = default;
-};
-} // namespace
-
-/// \brief Attempt to reduce a horizontal reduction.
-/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators Root in a basic block BB, then check
-/// if it can be done.
-/// \returns true if a horizontal reduction was matched and reduced.
-/// \returns false if a horizontal reduction was not matched.
-static bool canBeVectorized(
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If a horizontal reduction is not
+/// found and the root instruction is a binary operation, vectorization of
+/// its operands is attempted.
+/// \returns true if a horizontal reduction was matched and reduced, or the
+/// operands of one of the binary instructions were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not
+/// possible) and no vectorization of any binary operation feeding the
+/// \a Root instruction was performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI,
const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
@@ -4810,56 +4773,62 @@ static bool canBeVectorized(
if (Root->getParent() != BB)
return false;
- SmallVector<WeakTrackingVHWithLevel, 8> Stack(1, Root);
+  // Start the analysis from the Root instruction. If a horizontal reduction
+  // is found, try to vectorize it. If it is not a horizontal reduction, or
+  // vectorization is not possible or not effective, and the currently
+  // analyzed instruction is a binary operation, try to vectorize its
+  // operands in pre-order DFS traversal order. If the operands were not
+  // vectorized, repeat the same procedure, considering each operand as a
+  // possible root of a horizontal reduction.
+  // Interrupt the process once the Root instruction itself is vectorized or
+  // all sub-trees no deeper than RecursionMaxDepth have been analyzed/vectorized.
+ SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
SmallSet<Value *, 8> VisitedInstrs;
bool Res = false;
while (!Stack.empty()) {
- Value *V = Stack.back();
- if (!V) {
- Stack.pop_back();
+ Value *V;
+ unsigned Level;
+ std::tie(V, Level) = Stack.pop_back_val();
+ if (!V)
continue;
- }
auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst)) {
- Stack.pop_back();
+ if (!Inst || isa<PHINode>(Inst))
continue;
- }
- if (Stack.back().isInitial()) {
- Stack.back().clearInitial();
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, BI)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- P = nullptr;
- continue;
- }
- }
- if (P) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
- if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
- if (!Inst) {
- P = nullptr;
- continue;
- }
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+          // Set P to nullptr to avoid re-analysis of the phi node in the
+          // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
}
}
- P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- Res = true;
- continue;
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+          // Set P to nullptr to avoid re-analysis of the phi node in the
+          // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
}
}
- if (Stack.back().isFinal()) {
- Stack.pop_back();
+    // Set P to nullptr to avoid re-analysis of the phi node in the
+    // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
continue;
}
- if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
- if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
- Stack.size() < RecursionMaxDepth)
- Stack.push_back(NextV);
+ // Try to vectorize operands.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ Stack.emplace_back(Op, Level);
}
return Res;
}
@@ -4876,10 +4845,10 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
if (!isa<BinaryOperator>(I))
P = nullptr;
// Try to match and vectorize a horizontal reduction.
- return canBeVectorized(P, I, BB, R, TTI,
- [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
- return tryToVectorize(BI, R);
- });
+ return tryToVectorizeHorReductionOrInstOperands(
+ P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
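
The rewritten traversal drops the stateful callback handle in favor of plain (value, depth) pairs on an explicit stack. A stripped-down, self-contained sketch of that control flow, assuming a TryVectorize callback and a depth bound in place of the pass's own:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/ValueHandle.h"
  #include <tuple>
  #include <utility>

  static bool walkOperandsDFS(
      llvm::Instruction *Root, unsigned MaxDepth,
      llvm::function_ref<bool(llvm::Instruction *)> TryVectorize) {
    llvm::SmallVector<std::pair<llvm::WeakTrackingVH, unsigned>, 8> Stack(
        1, {Root, 0});
    bool Res = false;
    while (!Stack.empty()) {
      llvm::Value *V;
      unsigned Level;
      std::tie(V, Level) = Stack.pop_back_val();
      if (!V)
        continue; // the tracking handle went null: instruction was deleted
      auto *I = llvm::dyn_cast<llvm::Instruction>(V);
      if (!I || llvm::isa<llvm::PHINode>(I))
        continue;
      if (TryVectorize(I)) {
        Res = true;
        continue; // do not descend into a sub-tree we just vectorized
      }
      if (++Level < MaxDepth) // bound the pre-order DFS by depth
        for (llvm::Value *Op : I->operand_values())
          Stack.emplace_back(Op, Level);
    }
    return Res;
  }
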
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index ef56fa1b9367..6793a49a2ddc 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -188,6 +188,7 @@ else() # if this is included from LLVM's CMake
else()
get_cmake_property(variableNames VARIABLES)
add_custom_target(builtins)
+ add_custom_target(install-builtins)
foreach(target ${LLVM_BUILTIN_TARGETS})
string(REPLACE "-" ";" builtin_target_list ${target})
foreach(item ${builtin_target_list})
@@ -218,6 +219,7 @@ else() # if this is included from LLVM's CMake
USE_TOOLCHAIN
${EXTRA_ARGS})
add_dependencies(builtins builtins-${target})
+ add_dependencies(install-builtins install-builtins-${target})
endforeach()
endif()
set(deps builtins)
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
index 0557008ceb4f..b3e41c7751c5 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
@@ -10,18 +10,27 @@
entry:
ret void
}
+
+ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+
+ attributes #1 = { nounwind }
+
...
---
name: test_constant
registers:
- { id: 0, class: _ }
+ - { id: 1, class: _ }
body: |
bb.0.entry:
; CHECK-LABEL: name: test_constant
; CHECK: %0(s32) = G_CONSTANT i32 5
+ ; CHECK: %1(s1) = G_CONSTANT i1 false
%0(s32) = G_CONSTANT i32 5
+ %1(s1) = G_CONSTANT i1 0
+ G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp.f32), %0, %0, %0, %0, %0, %0, %1, %1;
...
---
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index e245e4296df2..d8f9e4f51ff4 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -34,8 +34,6 @@ end:
; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 68b77ea3490e..662ea37a2b99 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -19,9 +19,8 @@
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
; GCN: ds_write_b32
-; GCN: s_waitcnt
-; GCN-NEXT: [[BB5]]
+; GCN: [[BB5]]
; GCN: s_or_b64 exec, exec
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 263059d4a6ed..d3f835bdf163 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -223,7 +223,6 @@ bb3:
; GCN-NEXT: [[BB2]]: ; %bb2
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]
-; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_getpc_b64 vcc
@@ -393,7 +392,6 @@ bb3:
; GCN-NEXT: ; BB#2: ; %if_uniform
; GCN: buffer_store_dword
-; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index 66148a43a271..caba83c50428 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspa
; FIXME: Why isn't this being folded as a constant?
; GCN-LABEL: {{^}}commute_ne_litk_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
-; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
+; GCN: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, [[K]]
define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -99,11 +99,9 @@ define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrsp
ret void
}
-; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
-
; GCN-LABEL: {{^}}commute_ule_64_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
+; GCN: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, [[K]]
define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -702,7 +700,7 @@ define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double ad
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
-; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
entry:
%stack0 = alloca i32
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index d3e6c11ef908..79d9b1691878 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -37,22 +37,21 @@
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN: mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: {{^}}BB{{[0-9]+}}_1: ; %if
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
+; GCN: s_waitcnt lgkmcnt(0)
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
-; GCN: s_waitcnt vmcnt(0)
; Spill val register
; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0)
; VMEM: [[ENDIF]]:
; Reload and restore exec mask
+; VGPR: s_waitcnt lgkmcnt(0)
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -119,7 +118,6 @@ endif:
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: ; mask branch [[END:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[END]]
@@ -130,7 +128,6 @@ endif:
; GCN: v_cmp_ne_u32_e32 vcc,
; GCN: s_and_b64 vcc, exec, vcc
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
@@ -197,7 +194,6 @@ end:
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 exec, [[CMP0]]
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; FIXME: It makes no sense to put this skip here
; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
@@ -235,7 +231,6 @@ end:
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[ENDIF]]
@@ -245,14 +240,12 @@ end:
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_branch [[FLOW]]
; GCN: [[ENDIF]]:
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index b18ae353ca4c..fab1f8d12253 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -120,8 +120,7 @@ entry:
; FIXME: The waitcnt for the argument load can go after the loop
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
-; GCN: s_waitcnt lgkmcnt(0)
-
+; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
@@ -250,8 +249,6 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt lgkmcnt(0)
-
; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
@@ -290,7 +287,6 @@ entry:
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt lgkmcnt(0)
; The offset depends on the register that holds the first element of the vector.
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
@@ -330,9 +326,9 @@ entry:
; IDXMODE: s_set_gpr_idx_on 0, src0
; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_waitcnt vmcnt(0)
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
@@ -411,6 +407,7 @@ bb2:
; IDXMODE: s_set_gpr_idx_on 0, dst
; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 73482756b8c8..3caffc342c7e 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -4,8 +4,8 @@
; SI-LABEL: {{^}}infinite_loop:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
; SI: BB0_1:
+; SI: s_waitcnt lgkmcnt(0)
; SI: buffer_store_dword [[REG]]
-; SI: s_waitcnt vmcnt(0) expcnt(0)
; SI: s_branch BB0_1
define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 350dd38ef583..1edccff3bf15 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -421,11 +421,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
}
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
-; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN: flat_load_dword [[IDX:v[0-9]+]]
; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@@ -449,11 +448,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
}
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
-; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GCN: flat_load_dword [[IDX:v[0-9]+]]
; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
index 555a1d23ebe9..e50455f6f9a1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -58,7 +58,7 @@ main_body:
;
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
index 5ae255c7a26c..81597516d5f2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -58,7 +58,7 @@ main_body:
;
;CHECK-LABEL: {{^}}buffer_store_wait:
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index 02642142ae2c..d97644262016 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -5,7 +5,6 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
; FUNC-LABEL: {{^}}ds_swizzle:
; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11")
-; CHECK: s_waitcnt lgkmcnt
define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
%swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
store i32 %swizzle, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index c74c0fa15855..a289f7b0cfb1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -130,7 +130,7 @@ main_body:
;
; GCN-LABEL: {{^}}image_store_wait:
; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-; GCN: s_waitcnt vmcnt(0) expcnt(0)
+; GCN: s_waitcnt expcnt(0)
; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 055dddbfa8af..9a27809f37bb 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -2,6 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdhsa-opencl -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-OPENCL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
; RUN: llc -mtriple=amdgcn-mesa-unknown -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-NOENV %s
+; RUN: llc -mtriple=amdgcn--amdhsa-amdgizcl -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL,HSA-OPENCL %s
; ALL-LABEL: {{^}}test:
; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index ef9cda142850..3d815cca5be2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -1,10 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 -check-prefix=NOAUTO %s
+; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 -check-prefix=AUTO %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=NOAUTO %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=AUTO %s
; GCN-LABEL: {{^}}test_barrier:
; GFX8: buffer_store_dword
; GFX9: flat_store_dword
-; GCN: s_waitcnt
+; NOAUTO: s_waitcnt
+; AUTO-NOT: s_waitcnt
; GCN: s_barrier
define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index b488565c6b3a..224b2ed72e3b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @test_s_dcache_inv() #0 {
; GCN: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv()
- call void @llvm.amdgcn.s.waitcnt(i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:
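
The switch from 0 to 127 is not arbitrary. Assuming the SI s_waitcnt immediate layout (vmcnt in bits [3:0], expcnt in bits [6:4], lgkmcnt in bits [11:8]), 127 sets vmcnt and expcnt to their "do not wait" maxima and lgkmcnt to 0, so only the LGKM counter is waited on, matching the lgkmcnt(0) CHECK line. A small C++ check of that arithmetic:

  // Pack a waitcnt immediate from its three fields (SI layout assumed here).
  constexpr unsigned encodeWaitcnt(unsigned VmCnt, unsigned ExpCnt,
                                   unsigned LgkmCnt) {
    return (VmCnt & 0xF) | ((ExpCnt & 0x7) << 4) | ((LgkmCnt & 0xF) << 8);
  }
  // 15 and 7 are the field maxima ("do not wait on this counter").
  static_assert(encodeWaitcnt(15, 7, 0) == 127,
                "i32 127 waits only on lgkmcnt(0)");
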
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index a3a5c329f411..f96d5db5794a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @test_s_dcache_inv_vol() #0 {
; GCN: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.inv.vol()
- call void @llvm.amdgcn.s.waitcnt(i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index 909a85dda3e8..99b651350439 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_s_dcache_wb() #0 {
; VI: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb()
- call void @llvm.amdgcn.s.waitcnt(i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index 217bf97c41a4..844fcecdb48b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_s_dcache_wb_vol() #0 {
; VI: s_waitcnt lgkmcnt(0) ; encoding
define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
call void @llvm.amdgcn.s.dcache.wb.vol()
- call void @llvm.amdgcn.s.waitcnt(i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 127)
br label %end
end:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
index 6083ec885a86..ee58d359a935 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -18,8 +18,8 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float>
;
; CHECK-LABEL: {{^}}test2:
; CHECK: image_load
-; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
-; CHECK: s_waitcnt
+; CHECK-NEXT: s_waitcnt
+; CHECK: s_waitcnt vmcnt(0){{$}}
; CHECK-NEXT: image_store
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
%t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 9d0b6b395996..82c27f204a47 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -362,6 +362,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1
; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: ; return
define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
diff --git a/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
new file mode 100644
index 000000000000..bced3c408c52
--- /dev/null
+++ b/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: @volatile_load
+; GCN: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) {
+bb:
+ %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
+ %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
+ store i32 %tmp18, i32 addrspace(1)* %tmp26, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index f2fbacbab82e..e7a05d94cdc4 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -65,7 +65,6 @@ ret.bb: ; preds = %else, %main_body
; GCN-NEXT: ; %unreachable.bb
; GCN: ds_write_b32
-; GCN: s_waitcnt
; GCN: ; divergent unreachable
; GCN: ; %ret.bb
@@ -73,6 +72,7 @@ ret.bb: ; preds = %else, %main_body
; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index cb010cf15300..5b0d5274d5bc 100644
--- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -9,7 +9,6 @@
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
-; GCN: s_waitcnt
; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
@@ -38,7 +37,6 @@ ret:
; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
; GCN: ; divergent unreachable
-; GCN: s_waitcnt
; GCN: [[RETURN]]:
; GCN-NEXT: s_or_b64 exec, exec
@@ -66,7 +64,6 @@ unreachable:
; GCN: [[UNREACHABLE]]:
; GCN: ds_write_b32
-; GCN: s_waitcnt
define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
bb:
%tmp63 = icmp eq i32 %arg0, 32
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index 343211b0219c..333113e8a9b6 100644
--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -5,7 +5,7 @@
; GCN-FUNC: {{^}}vccz_workaround:
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0{{$}}
-; GCN: s_waitcnt lgkmcnt(0)
+; VCCZ-BUG: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VCCZ-BUG: s_mov_b64 vcc, vcc
; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 8f1aebfe9ceb..7e8fa118c2c2 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -18,13 +18,11 @@
; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
-; TOVMEM: s_waitcnt vmcnt(0)
; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
; TOSMEM-NOT: [[M0_COPY]]
; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
-; TOSMEM: s_waitcnt lgkmcnt(0)
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index cf9e714ea6d3..1d407ea9bcda 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 85a8929ebe58..a67f36d0a7e8 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -11,7 +11,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
-; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
@@ -72,7 +71,6 @@ end:
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
@@ -101,7 +99,6 @@ exit:
; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
@@ -132,7 +129,6 @@ exit:
; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
; SI: ds_write_b32
-; SI: s_waitcnt
; SI-NEXT: {{^}}[[FLOW]]:
; SI-NEXT: s_or_saveexec_b64
@@ -140,8 +136,8 @@ exit:
; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
-; SI: buffer_store_dword
-; SI-NEXT: s_waitcnt
+; SI: s_waitcnt
+; SI-NEXT: buffer_store_dword
; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_or_b64 exec, exec
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index f4aba880ff76..1c7769894a27 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -974,6 +974,68 @@ entry:
ret [2 x i32*] %r
}
+declare arm_aapcscc {i32, i32} @structs_target({i32, i32}, {i32*, float, i32, double})
+
+define arm_aapcscc {i32, i32} @test_structs({i32, i32} %x, {i32*, float, i32, double} %y) {
+; CHECK-LABEL: test_structs
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[Y2_ID:[0-9]+]], offset: 0, size: 4
+; CHECK-DAG: id: [[Y3_ID:[0-9]+]], offset: 8, size: 8
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+; CHECK: [[Y2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Y2_ID]]
+; CHECK: [[Y2:%[0-9]+]](s32) = G_LOAD [[Y2_ADDR]](p0){{.*}}load 4
+; CHECK: [[Y3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Y3_ID]]
+; CHECK: [[Y3:%[0-9]+]](s64) = G_LOAD [[Y3_ADDR]](p0){{.*}}load 8
+; CHECK: [[X_0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[X_1:%[0-9]+]](s64) = G_INSERT [[X_0]], [[X0]](s32), 0
+; CHECK: [[X_2:%[0-9]+]](s64) = G_INSERT [[X_1]], [[X1]](s32), 32
+; CHECK: [[X:%[0-9]+]](s64) = COPY [[X_2]]
+; CHECK: [[Y_0:%[0-9]+]](s192) = IMPLICIT_DEF
+; CHECK: [[Y_1:%[0-9]+]](s192) = G_INSERT [[Y_0]], [[Y0]](s32), 0
+; CHECK: [[Y_2:%[0-9]+]](s192) = G_INSERT [[Y_1]], [[Y1]](s32), 32
+; CHECK: [[Y_3:%[0-9]+]](s192) = G_INSERT [[Y_2]], [[Y2]](s32), 64
+; CHECK: [[Y_4:%[0-9]+]](s192) = G_INSERT [[Y_3]], [[Y3]](s64), 128
+; CHECK: [[Y:%[0-9]+]](s192) = COPY [[Y_4]]
+; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[X0:%[0-9]+]](s32) = G_EXTRACT [[X]](s64), 0
+; CHECK: [[X1:%[0-9]+]](s32) = G_EXTRACT [[X]](s64), 32
+; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 0
+; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 32
+; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y]](s192), 64
+; CHECK: [[Y3:%[0-9]+]](s64) = G_EXTRACT [[Y]](s192), 128
+; CHECK-DAG: %r0 = COPY [[X0]](s32)
+; CHECK-DAG: %r1 = COPY [[X1]](s32)
+; CHECK-DAG: %r2 = COPY [[Y0]](s32)
+; CHECK-DAG: %r3 = COPY [[Y1]](s32)
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Y2_OFF:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[Y2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Y2_OFF]](s32)
+; CHECK: G_STORE [[Y2]](s32), [[Y2_ADDR]](p0){{.*}}store 4
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Y3_OFF:%[0-9]+]](s32) = G_CONSTANT i32 8
+; CHECK: [[Y3_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Y3_OFF]](s32)
+; CHECK: G_STORE [[Y3]](s64), [[Y3_ADDR]](p0){{.*}}store 8
+; CHECK: BLX @structs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[R_0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[R_1:%[0-9]+]](s64) = G_INSERT [[R_0]], [[R0]](s32), 0
+; CHECK: [[R_2:%[0-9]+]](s64) = G_INSERT [[R_1]], [[R1]](s32), 32
+; CHECK: [[R:%[0-9]+]](s64) = COPY [[R_2]]
+; CHECK: ADJCALLSTACKUP 16, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R]](s64), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R]](s64), 32
+; CHECK: %r0 = COPY [[R0]](s32)
+; CHECK: %r1 = COPY [[R1]](s32)
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+ %r = notail call arm_aapcscc {i32, i32} @structs_target({i32, i32} %x, {i32*, float, i32, double} %y)
+ ret {i32, i32} %r
+}
+
define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0
diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
index ef30cb1063f8..34f00aebe1be 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -54,10 +54,15 @@ define [16 x i32] @test_ret_demotion() {
ret [16 x i32] %res
}
-define void @test_structs({i32, i32} %struct) {
-; CHECK: remark: {{.*}} unable to lower arguments: void ({ i32, i32 })*
-; CHECK-LABEL: warning: Instruction selection used fallback path for test_structs
- ret void
+%large.struct = type { i32, i32, i32, i32, i32} ; Doesn't fit in R0-R3
+
+declare %large.struct @large_struct_return_target()
+
+define %large.struct @test_large_struct_return() {
+; CHECK: remark: {{.*}} unable to translate instruction: call{{.*}} @large_struct_return_target
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_large_struct_return
+ %r = call %large.struct @large_struct_return_target()
+ ret %large.struct %r
}
define void @test_vararg_definition(i32 %a, ...) {
diff --git a/test/CodeGen/ARM/cortex-a57-misched-alu.ll b/test/CodeGen/ARM/cortex-a57-misched-alu.ll
new file mode 100644
index 000000000000..960ee87532b0
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-alu.ll
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; Check the latency for ALU shifted operand variants.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: foo:BB#0 entry
+
+; ALU, basic - 1 cyc I0/I1
+; CHECK: EORrr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 1
+
+; ALU, shift by immed - 2 cyc M
+; CHECK: ADDrsi
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; ALU, shift by register, unconditional - 2 cyc M
+; CHECK: RSBrsr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; ALU, shift by register, conditional - 2 cyc I0/I1
+; CHECK: ANDrsr
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+; Checking scheduling units
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; Skipping COPY
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: ANDrsr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: CMPri
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: RSBrsr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitM
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: ADDrsi
+; CHECK: Ready
+; CHECK-NEXT: A57UnitM
+
+; CHECK: ** ScheduleDAGMILive::schedule picking next node
+; CHECK: Scheduling
+; CHECK-SAME: EORrr
+; CHECK: Ready
+; CHECK-NEXT: A57UnitI
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8r-arm-none-eabi"
+
+; Function Attrs: norecurse nounwind readnone
+define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
+entry:
+ %xor = xor i32 %a, %b
+ %xor_shl = shl i32 %xor, 2
+ %add = add i32 %xor_shl, %d
+ %add_ashr = ashr i32 %add, %a
+ %sub = sub i32 %add_ashr, %a
+ %sub_lshr_pred = lshr i32 %sub, %c
+ %pred = icmp sgt i32 %a, 4
+ %and = and i32 %sub_lshr_pred, %b
+ %rv = select i1 %pred, i32 %and, i32 %d
+ ret i32 %rv
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-basic.ll b/test/CodeGen/ARM/cortex-a57-misched-basic.ll
new file mode 100644
index 000000000000..2ec50b9d3343
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-basic.ll
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=A57_SCHED
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+
+; Check instruction latencies for both generic and cortex-a57.
+; SDIV should be scheduled at the beginning of the block (20 cycles on an
+; independent M unit).
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: foo:BB#0 entry
+
+; GENERIC: SDIV
+; GENERIC: Latency : 1
+; GENERIC: EORrr
+; GENERIC: Latency : 1
+; GENERIC: LDRi12
+; GENERIC: Latency : 4
+; GENERIC: ADDrr
+; GENERIC: Latency : 1
+; GENERIC: SUBrr
+; GENERIC: Latency : 1
+
+; A57_SCHED: SDIV
+; A57_SCHED: Latency : 20
+; A57_SCHED: EORrr
+; A57_SCHED: Latency : 1
+; A57_SCHED: LDRi12
+; A57_SCHED: Latency : 4
+; A57_SCHED: ADDrr
+; A57_SCHED: Latency : 1
+; A57_SCHED: SUBrr
+; A57_SCHED: Latency : 1
+
+; CHECK: ** Final schedule for BB#0 ***
+; GENERIC: LDRi12
+; GENERIC: SDIV
+; A57_SCHED: SDIV
+; A57_SCHED: LDRi12
+; CHECK: ********** INTERVALS **********
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8r-arm-none-eabi"
+
+; Function Attrs: norecurse nounwind readnone
+define hidden i32 @foo(i32 %a, i32 %b, i32 %c, i32* %d) local_unnamed_addr #0 {
+entry:
+ %xor = xor i32 %c, %b
+ %ld = load i32, i32* %d
+ %add = add nsw i32 %xor, %ld
+ %div = sdiv i32 %a, %b
+ %sub = sub i32 %div, %add
+ ret i32 %sub
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
new file mode 100644
index 000000000000..d54848a6bcf1
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+;
+
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = global i32 0, align 4
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling pass to combine the single loads into an LDM instruction
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDMIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 4
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=4
+define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
+ %1 = load i32, i32* @a, align 4
+ %2 = load i32, i32* @b, align 4
+ %3 = load i32, i32* @c, align 4
+
+ %ptr_after = getelementptr i32, i32* @a, i32 3
+
+ %ptr_val = ptrtoint i32* %ptr_after to i32
+ %mul1 = mul i32 %ptr_val, %1
+ %mul2 = mul i32 %mul1, %2
+ %mul3 = mul i32 %mul2, %3
+ ret i32 %mul3
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
new file mode 100644
index 000000000000..9cb076651f5b
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
@@ -0,0 +1,28 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling pass to combine the single loads into an LDM instruction
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDMIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 3
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=3
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=3
+
+define i32 @foo(i32* %a) nounwind optsize {
+entry:
+ %b = getelementptr i32, i32* %a, i32 1
+ %c = getelementptr i32, i32* %a, i32 2
+ %0 = load i32, i32* %a, align 4
+ %1 = load i32, i32* %b, align 4
+ %2 = load i32, i32* %c, align 4
+
+ %mul1 = mul i32 %0, %1
+ %mul2 = mul i32 %mul1, %2
+ ret i32 %mul2
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
new file mode 100644
index 000000000000..774b0a907e39
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; An N=3 STMIA_UPD should have a latency of 2 cycles and a writeback latency of 1 cycle
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling pass to combine the single stores into an STM instruction
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: STMIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+; CHECK: Successors
+; CHECK: data
+; CHECK-SAME: Latency=1
+
+define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) {
+
+ %addr.1 = getelementptr i32, i32* %addr, i32 0
+ store i32 %v0, i32* %addr.1
+
+ %addr.2 = getelementptr i32, i32* %addr, i32 1
+ store i32 %v1, i32* %addr.2
+
+ %addr.3 = getelementptr i32, i32* %addr, i32 2
+ store i32 %v2, i32* %addr.3
+
+ %ptr_after = getelementptr i32, i32* %addr, i32 3
+ %val = ptrtoint i32* %ptr_after to i32
+
+ %rv1 = mul i32 %val, %v0
+ %rv2 = mul i32 %rv1, %v1
+ %rv3 = mul i32 %rv2, %v2
+
+ ret i32 %rv3
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-stm.ll b/test/CodeGen/ARM/cortex-a57-misched-stm.ll
new file mode 100644
index 000000000000..474f39d84bae
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-stm.ll
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; An N=3 STMIB should have a latency of 2 cycles
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling pass to combine the single stores into an STM instruction
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: STMIB
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+define i32 @test_stm(i32 %v0, i32 %v1, i32* %addr) {
+
+ %addr.1 = getelementptr i32, i32* %addr, i32 1
+ store i32 %v0, i32* %addr.1
+
+ %addr.2 = getelementptr i32, i32* %addr, i32 2
+ store i32 %v1, i32* %addr.2
+
+ %addr.3 = getelementptr i32, i32* %addr, i32 3
+ %val = ptrtoint i32* %addr to i32
+ store i32 %val, i32* %addr.3
+
+ %rv = add i32 %v0, %v1
+
+ ret i32 %rv
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
new file mode 100644
index 000000000000..a9223e1e2a99
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; Check the latencies of VMUL/VFMA multiply-accumulate chains.
+
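+; Comment lines starting with "; >" describe the expected latency verified by
+; the CHECK line that follows them.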
+define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test1:BB#0
+
+; CHECK: VMULS
+; > VMULS common latency = 5
+; CHECK: Latency : 5
+; CHECK: Successors:
+; CHECK: data
+; > VMULS read-advanced latency to VMLAS = 0
+; CHECK-SAME: Latency=0
+
+; CHECK: VMLAS
+; > VMLAS common latency = 9
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS read-advanced latency to the next VMLAS = 4
+; CHECK-SAME: Latency=4
+
+; CHECK: VMLAS
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAS not-optimized latency to VMOVRS = 9
+; CHECK-SAME: Latency=9
+
+; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLAS, VMLAS
+ %mul1 = fmul float %f1, %f2
+ %mul2 = fmul float %f3, %f4
+ %mul3 = fmul float %f5, %f6
+ %add1 = fadd float %mul1, %mul2
+ %add2 = fadd float %add1, %mul3
+ ret float %add2
+}
+
+; ASIMD form
+define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
+; CHECK: ********** MI Scheduling **********
+; CHECK: Test2:BB#0
+
+; CHECK: VMULfd
+; > VMULfd common latency = 5
+; CHECK: Latency : 5
+; CHECK: Successors:
+; CHECK: data
+; > VMULfd read-advanced latency to VMLAfd = 0
+; CHECK-SAME: Latency=0
+
+; CHECK: VMLAfd
+; > VMLAfd common latency = 9
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAfd read-advanced latency to the next VMLAfd = 4
+; CHECK-SAME: Latency=4
+
+; CHECK: VMLAfd
+; CHECK: Latency : 9
+; CHECK: Successors:
+; CHECK: data
+; > VMLAfd not-optimized latency to VMOVRRD = 9
+; CHECK-SAME: Latency=9
+
+; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULfd, VMLAfd, VMLAfd
+ %mul1 = fmul <2 x float> %f1, %f2
+ %mul2 = fmul <2 x float> %f3, %f4
+ %mul3 = fmul <2 x float> %f5, %f6
+ %add1 = fadd <2 x float> %mul1, %mul2
+ %add2 = fadd <2 x float> %add1, %mul3
+ ret <2 x float> %add2
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
new file mode 100644
index 000000000000..6cfa823fb969
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@@ -0,0 +1,50 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+;
+
+@a = global double 0.0, align 4
+@b = global double 0.0, align 4
+@c = global double 0.0, align 4
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling run so that single loads are combined into a VLDM instruction.
+; CHECK: ********** MI Scheduling **********
+; CHECK: VLDMDIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 6
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=1
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=6
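+; The two Latency=1 successors are presumably the uses of the written-back
+; GPR (the store and the first multiply); the Latency=5/6 successors are the
+; loaded D-register values.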
+define i32 @bar(i32* %iptr) minsize optsize {
+ %1 = load double, double* @a, align 8
+ %2 = load double, double* @b, align 8
+ %3 = load double, double* @c, align 8
+
+ %ptr_after = getelementptr double, double* @a, i32 3
+
+ %ptr_new_ival = ptrtoint double* %ptr_after to i32
+ %ptr_new = inttoptr i32 %ptr_new_ival to i32*
+
+ store i32 %ptr_new_ival, i32* %iptr, align 8
+
+ %v1 = fptoui double %1 to i32
+
+ %mul1 = mul i32 %ptr_new_ival, %v1
+
+ %v2 = fptoui double %2 to i32
+ %v3 = fptoui double %3 to i32
+
+ %mul2 = mul i32 %mul1, %v2
+ %mul3 = mul i32 %mul2, %v3
+
+ ret i32 %mul3
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
new file mode 100644
index 000000000000..218b5b41a7e4
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
@@ -0,0 +1,30 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling run so that single loads are combined into a VLDM instruction.
+; CHECK: ********** MI Scheduling **********
+; CHECK: VLDMDIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 6
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=5
+; CHECK-NEXT: data
+; CHECK-SAME: Latency=6
+
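+; The three adjacent double loads below should be merged into the single
+; VLDMDIA checked above.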
+define double @foo(double* %a) nounwind optsize {
+entry:
+ %b = getelementptr double, double* %a, i32 1
+ %c = getelementptr double, double* %a, i32 2
+ %0 = load double, double* %a, align 4
+ %1 = load double, double* %b, align 4
+ %2 = load double, double* %c, align 4
+
+ %mul1 = fmul double %0, %1
+ %mul2 = fmul double %mul1, %2
+ ret double %mul2
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
new file mode 100644
index 000000000000..af1c469d4443
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll
@@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling run so that single stores are combined into a VSTM instruction.
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: VSTMDIA_UPD
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 4
+; CHECK: Successors:
+; CHECK: data
+; CHECK-SAME: Latency=1
+
+@a = global double 0.0, align 4
+@b = global double 0.0, align 4
+@c = global double 0.0, align 4
+
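+; Storing and multiplying the post-increment address below presumably keeps
+; the VSTMDIA_UPD writeback live, producing the Latency=1 data edge checked
+; above.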
+define i32 @bar(double* %vptr, i32 %iv1, i32* %iptr) minsize {
+
+ %vp2 = getelementptr double, double* %vptr, i32 1
+ %vp3 = getelementptr double, double* %vptr, i32 2
+
+ %v1 = load double, double* %vptr, align 8
+ %v2 = load double, double* %vp2, align 8
+ %v3 = load double, double* %vp3, align 8
+
+ store double %v1, double* @a, align 8
+ store double %v2, double* @b, align 8
+ store double %v3, double* @c, align 8
+
+ %ptr_after = getelementptr double, double* @a, i32 3
+
+ %ptr_new_ival = ptrtoint double* %ptr_after to i32
+ %ptr_new = inttoptr i32 %ptr_new_ival to i32*
+
+ store i32 %ptr_new_ival, i32* %iptr, align 8
+
+ %mul1 = mul i32 %ptr_new_ival, %iv1
+
+ ret i32 %mul1
+}
+
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vstm.ll b/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
new file mode 100644
index 000000000000..f31474f66558
--- /dev/null
+++ b/test/CodeGen/ARM/cortex-a57-misched-vstm.ll
@@ -0,0 +1,23 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -misched-postra -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; We need a second, post-RA scheduling run so that single stores are combined into a VSTM instruction.
+; CHECK: ********** MI Scheduling **********
+; CHECK: schedule starting
+; CHECK: VSTMDIA
+; CHECK: rdefs left
+; CHECK-NEXT: Latency : 2
+
+%bigVec = type [2 x double]
+
+@var = global %bigVec zeroinitializer
+
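+; The aggregate store should be lowered to the VSTMDIA checked above.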
+define void @bar(%bigVec* %ptr) {
+
+ %tmp = load %bigVec, %bigVec* %ptr
+ store %bigVec %tmp, %bigVec* @var
+
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/global-merge-external.ll b/test/CodeGen/ARM/global-merge-external.ll
index a9e0d199705a..03c977614320 100644
--- a/test/CodeGen/ARM/global-merge-external.ll
+++ b/test/CodeGen/ARM/global-merge-external.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-MERGE
; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefix=CHECK-NO-MERGE
; RUN: llc < %s -mtriple=arm-macho -arm-global-merge | FileCheck %s --check-prefix=CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -relocation-model=pic | FileCheck %s --check-prefix=CHECK-NO-MERGE
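+; Check that globals are not merged when compiling with -relocation-model=pic
+; (hence the CHECK-NO-MERGE prefix on that RUN line).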
@x = global i32 0, align 4
@y = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/newify-crash.ll b/test/CodeGen/Hexagon/newify-crash.ll
new file mode 100644
index 000000000000..705170b13a59
--- /dev/null
+++ b/test/CodeGen/Hexagon/newify-crash.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+;
+; Check that this test case doesn't crash.
+; CHECK: vadd
+
+target triple = "hexagon"
+
+define void @fred() #0 {
+b0:
+ br label %b1
+
+b1: ; preds = %b7, %b0
+ %v2 = phi i32 [ 0, %b0 ], [ %v16, %b7 ]
+ %v3 = phi <32 x i32> [ undef, %b0 ], [ %v15, %b7 ]
+ %v4 = icmp slt i32 %v2, undef
+ br i1 %v4, label %b5, label %b7
+
+b5: ; preds = %b1
+ %v6 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v3, <32 x i32> undef)
+ br label %b7
+
+b7: ; preds = %b5, %b1
+ %v8 = phi <32 x i32> [ %v6, %b5 ], [ %v3, %b1 ]
+ %v9 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v8, <32 x i32> undef)
+ %v10 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v9, <32 x i32> undef)
+ %v11 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v10, <32 x i32> undef)
+ %v12 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v11, <32 x i32> undef)
+ %v13 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v12, <32 x i32> zeroinitializer)
+ %v14 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v13, <32 x i32> undef)
+ %v15 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %v14, <32 x i32> undef)
+ %v16 = add nsw i32 %v2, 8
+ %v17 = icmp eq i32 %v16, 64
+ br i1 %v17, label %b18, label %b1
+
+b18: ; preds = %b7
+ tail call void @f0() #0
+ ret void
+}
+
+declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #1
+declare void @f0() #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/MIR/Generic/runPass.mir b/test/CodeGen/MIR/Generic/runPass.mir
index eeef9d526510..33380d4c6bb4 100644
--- a/test/CodeGen/MIR/Generic/runPass.mir
+++ b/test/CodeGen/MIR/Generic/runPass.mir
@@ -1,4 +1,5 @@
# RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s | FileCheck %s
+# RUN: llc -run-pass=regallocbasic -debug-pass=Arguments -o - %s | FileCheck %s
# Check that passes are initialized correctly, so that it's possible to
# use -run-pass.
diff --git a/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll b/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll
new file mode 100644
index 000000000000..804ea1e5c438
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs < %s | FileCheck %s
+
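+; Check that the byte/halfword loads and stores below are selected as their
+; 16-bit microMIPS forms (lbu16/sb16 and lhu16/sh16).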
+define void @f1(i8* %p) {
+entry:
+; CHECK-LABEL: f1:
+; CHECK: lbu16
+; CHECK: sb16
+ %0 = load i8, i8* %p, align 4
+ %a = zext i8 %0 to i32
+ %and = and i32 %a, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i8 0, i8* %p, align 1
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @f2(i16* %p) {
+entry:
+; CHECK-LABEL: f2:
+; CHECK: lhu16
+; CHECK: sh16
+ %0 = load i16, i16* %p, align 2
+ %a = zext i16 %0 to i32
+ %and = and i32 %a, 2
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i16 0, i16* %p, align 2
+ br label %if.end
+
+if.end:
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/scavenging.mir b/test/CodeGen/PowerPC/scavenging.mir
new file mode 100644
index 000000000000..8b5c26230bc6
--- /dev/null
+++ b/test/CodeGen/PowerPC/scavenging.mir
@@ -0,0 +1,149 @@
+# RUN: llc -mtriple=ppc64-- -run-pass scavenger-test -verify-machineinstrs -o - %s | FileCheck %s
+---
+# CHECK-LABEL: name: noscav0
+name: noscav0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK: [[REG0:%r[0-9]+]] = LI 42
+ ; CHECK-NEXT: NOP implicit [[REG0]]
+ %0 : gprc = LI 42
+ NOP implicit %0
+
+ ; CHECK: [[REG1:%r[0-9]+]] = LI 42
+ ; CHECK-NEXT: NOP
+ ; CHECK-NEXT: NOP implicit [[REG1]]
+ ; CHECK-NEXT: NOP
+ ; CHECK-NEXT: NOP implicit [[REG1]]
+ %1 : gprc = LI 42
+ NOP
+ NOP implicit %1
+ NOP
+ NOP implicit %1
+
+ ; CHECK: [[REG2:%r[0-9]+]] = LI 42
+ ; CHECK-NEXT: NOP implicit [[REG2]]
+ %2 : gprc = LI 42
+ NOP implicit %2
+
+ %x0 = IMPLICIT_DEF
+ %x1 = IMPLICIT_DEF
+ %x2 = IMPLICIT_DEF
+ %x3 = IMPLICIT_DEF
+ %x4 = IMPLICIT_DEF
+ %x27 = IMPLICIT_DEF
+ %x28 = IMPLICIT_DEF
+ %x29 = IMPLICIT_DEF
+ %x30 = IMPLICIT_DEF
+
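+    ; All the x-registers defined above are live here, so the new virtual
+    ; register must not be assigned to any of them.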
+ ; CHECK-NOT: %x0 = LI 42
+ ; CHECK-NOT: %x1 = LI 42
+ ; CHECK-NOT: %x2 = LI 42
+ ; CHECK-NOT: %x3 = LI 42
+ ; CHECK-NOT: %x4 = LI 42
+ ; CHECK-NOT: %x5 = LI 42
+ ; CHECK-NOT: %x27 = LI 42
+ ; CHECK-NOT: %x28 = LI 42
+ ; CHECK-NOT: %x29 = LI 42
+ ; CHECK-NOT: %x30 = LI 42
+ ; CHECK: [[REG3:%r[0-9]+]] = LI 42
+ ; CHECK-NEXT: %x5 = IMPLICIT_DEF
+ ; CHECK-NEXT: NOP implicit [[REG2]]
+ ; CHECK-NEXT: NOP implicit [[REG3]]
+ %3 : gprc = LI 42
+ %x5 = IMPLICIT_DEF
+ NOP implicit %2
+ NOP implicit %3
+
+ NOP implicit %x0
+ NOP implicit %x1
+ NOP implicit %x2
+ NOP implicit %x3
+ NOP implicit %x4
+ NOP implicit %x5
+ NOP implicit %x27
+ NOP implicit %x28
+ NOP implicit %x29
+ NOP implicit %x30
+...
+---
+# CHECK-LABEL: name: scav0
+name: scav0
+tracksRegLiveness: true
+stack:
+  # A variable-sized object should cause an emergency spill slot to be
+  # reserved in the RegScavenger.
+ - { id: 0, type: variable-sized, offset: -32, alignment: 1 }
+body: |
+ bb.0:
+ %x0 = IMPLICIT_DEF
+ %x1 = IMPLICIT_DEF
+ %x2 = IMPLICIT_DEF
+ %x3 = IMPLICIT_DEF
+ %x4 = IMPLICIT_DEF
+ %x5 = IMPLICIT_DEF
+ %x6 = IMPLICIT_DEF
+ %x7 = IMPLICIT_DEF
+ %x8 = IMPLICIT_DEF
+ %x9 = IMPLICIT_DEF
+ %x10 = IMPLICIT_DEF
+ %x11 = IMPLICIT_DEF
+ %x12 = IMPLICIT_DEF
+ %x13 = IMPLICIT_DEF
+ %x14 = IMPLICIT_DEF
+ %x15 = IMPLICIT_DEF
+ %x16 = IMPLICIT_DEF
+ %x17 = IMPLICIT_DEF
+ %x18 = IMPLICIT_DEF
+ %x19 = IMPLICIT_DEF
+ %x20 = IMPLICIT_DEF
+ %x21 = IMPLICIT_DEF
+ %x22 = IMPLICIT_DEF
+ %x23 = IMPLICIT_DEF
+ %x24 = IMPLICIT_DEF
+ %x25 = IMPLICIT_DEF
+ %x26 = IMPLICIT_DEF
+ %x27 = IMPLICIT_DEF
+ %x28 = IMPLICIT_DEF
+ %x29 = IMPLICIT_DEF
+ %x30 = IMPLICIT_DEF
+
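+    ; With every x-register above live, the scavenger must spill one (STD),
+    ; use it to materialize the constant, and reload it afterwards (LD).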
+ ; CHECK: STD killed [[SPILLEDREG:%x[0-9]+]]
+ ; CHECK: [[SPILLEDREG]] = LI8 42
+ ; CHECK: NOP implicit [[SPILLEDREG]]
+ ; CHECK: [[SPILLEDREG]] = LD
+ %0 : g8rc = LI8 42
+ NOP implicit %0
+
+ NOP implicit %x0
+ NOP implicit %x1
+ NOP implicit %x2
+ NOP implicit %x3
+ NOP implicit %x4
+ NOP implicit %x5
+ NOP implicit %x6
+ NOP implicit %x7
+ NOP implicit %x8
+ NOP implicit %x9
+ NOP implicit %x10
+ NOP implicit %x11
+ NOP implicit %x12
+ NOP implicit %x13
+ NOP implicit %x14
+ NOP implicit %x15
+ NOP implicit %x16
+ NOP implicit %x17
+ NOP implicit %x18
+ NOP implicit %x19
+ NOP implicit %x20
+ NOP implicit %x21
+ NOP implicit %x22
+ NOP implicit %x23
+ NOP implicit %x24
+ NOP implicit %x25
+ NOP implicit %x26
+ NOP implicit %x27
+ NOP implicit %x28
+ NOP implicit %x29
+ NOP implicit %x30
+...
diff --git a/test/CodeGen/SystemZ/RAbasic-invalid-LR-update.mir b/test/CodeGen/SystemZ/RAbasic-invalid-LR-update.mir
new file mode 100644
index 000000000000..2f532f0a5efb
--- /dev/null
+++ b/test/CodeGen/SystemZ/RAbasic-invalid-LR-update.mir
@@ -0,0 +1,267 @@
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=regallocbasic %s -o - | FileCheck %s
+# This test used to assert in RABasic. The problem was that when we split
+# live ranges, we were not updating the LiveRegMatrix properly, so the
+# interference calculation would not match what the assignment thought it
+# could do. In other words, this test case needs to trigger live-range
+# splitting to exercise the problem.
+#
+# PR33057
+--- |
+ target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+ target triple = "s390x--linux-gnu"
+
+ define void @autogen_SD21418() #0 {
+ ret void
+ }
+
+ attributes #0 = { "target-cpu"="z13" }
+
+...
+
+# CHECK: name: autogen_SD21418
+# Check that at least one live-range has been split
+# CHECK: id: 114, class
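+# (The function only defines vregs 0-113, so an id of 114 can only come from
+# a split live range.)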
+---
+name: autogen_SD21418
+alignment: 2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vr128bit }
+ - { id: 1, class: vr128bit }
+ - { id: 2, class: vr128bit }
+ - { id: 3, class: vr64bit }
+ - { id: 4, class: gr64bit }
+ - { id: 5, class: vr128bit }
+ - { id: 6, class: grx32bit }
+ - { id: 7, class: vr128bit }
+ - { id: 8, class: vr128bit }
+ - { id: 9, class: gr32bit }
+ - { id: 10, class: gr64bit }
+ - { id: 11, class: vr128bit }
+ - { id: 12, class: fp64bit }
+ - { id: 13, class: vr64bit }
+ - { id: 14, class: vr64bit }
+ - { id: 15, class: gr64bit }
+ - { id: 16, class: gr128bit }
+ - { id: 17, class: gr64bit }
+ - { id: 18, class: gr32bit }
+ - { id: 19, class: gr32bit }
+ - { id: 20, class: gr128bit }
+ - { id: 21, class: gr32bit }
+ - { id: 22, class: gr64bit }
+ - { id: 23, class: gr32bit }
+ - { id: 24, class: gr32bit }
+ - { id: 25, class: gr128bit }
+ - { id: 26, class: grx32bit }
+ - { id: 27, class: gr64bit }
+ - { id: 28, class: gr64bit }
+ - { id: 29, class: vr128bit }
+ - { id: 30, class: vr128bit }
+ - { id: 31, class: gr64bit }
+ - { id: 32, class: gr32bit }
+ - { id: 33, class: gr32bit }
+ - { id: 34, class: gr128bit }
+ - { id: 35, class: gr32bit }
+ - { id: 36, class: vr128bit }
+ - { id: 37, class: gr64bit }
+ - { id: 38, class: gr32bit }
+ - { id: 39, class: gr32bit }
+ - { id: 40, class: gr128bit }
+ - { id: 41, class: gr32bit }
+ - { id: 42, class: addr64bit }
+ - { id: 43, class: grx32bit }
+ - { id: 44, class: addr64bit }
+ - { id: 45, class: vr64bit }
+ - { id: 46, class: vr64bit }
+ - { id: 47, class: gr32bit }
+ - { id: 48, class: gr32bit }
+ - { id: 49, class: grx32bit }
+ - { id: 50, class: vr64bit }
+ - { id: 51, class: gr64bit }
+ - { id: 52, class: grx32bit }
+ - { id: 53, class: gr32bit }
+ - { id: 54, class: gr64bit }
+ - { id: 55, class: grx32bit }
+ - { id: 56, class: gr32bit }
+ - { id: 57, class: gr128bit }
+ - { id: 58, class: gr128bit }
+ - { id: 59, class: gr32bit }
+ - { id: 60, class: gr64bit }
+ - { id: 61, class: grx32bit }
+ - { id: 62, class: gr32bit }
+ - { id: 63, class: gr64bit }
+ - { id: 64, class: grx32bit }
+ - { id: 65, class: gr32bit }
+ - { id: 66, class: gr128bit }
+ - { id: 67, class: gr128bit }
+ - { id: 68, class: grx32bit }
+ - { id: 69, class: gr64bit }
+ - { id: 70, class: gr64bit }
+ - { id: 71, class: vr128bit }
+ - { id: 72, class: vr128bit }
+ - { id: 73, class: gr64bit }
+ - { id: 74, class: grx32bit }
+ - { id: 75, class: gr32bit }
+ - { id: 76, class: gr64bit }
+ - { id: 77, class: grx32bit }
+ - { id: 78, class: gr32bit }
+ - { id: 79, class: gr128bit }
+ - { id: 80, class: gr128bit }
+ - { id: 81, class: gr32bit }
+ - { id: 82, class: vr128bit }
+ - { id: 83, class: gr64bit }
+ - { id: 84, class: grx32bit }
+ - { id: 85, class: gr32bit }
+ - { id: 86, class: gr64bit }
+ - { id: 87, class: grx32bit }
+ - { id: 88, class: gr32bit }
+ - { id: 89, class: gr128bit }
+ - { id: 90, class: gr128bit }
+ - { id: 91, class: gr32bit }
+ - { id: 92, class: grx32bit }
+ - { id: 93, class: gr64bit }
+ - { id: 94, class: gr32bit }
+ - { id: 95, class: gr32bit }
+ - { id: 96, class: gr32bit }
+ - { id: 97, class: gr64bit }
+ - { id: 98, class: gr64bit }
+ - { id: 99, class: grx32bit }
+ - { id: 100, class: grx32bit }
+ - { id: 101, class: gr128bit }
+ - { id: 102, class: gr128bit }
+ - { id: 103, class: gr128bit }
+ - { id: 104, class: gr64bit }
+ - { id: 105, class: gr128bit }
+ - { id: 106, class: gr128bit }
+ - { id: 107, class: gr64bit }
+ - { id: 108, class: gr128bit }
+ - { id: 109, class: gr128bit }
+ - { id: 110, class: gr64bit }
+ - { id: 111, class: gr128bit }
+ - { id: 112, class: gr128bit }
+ - { id: 113, class: gr64bit }
+constants:
+ - id: 0
+ value: double 0xD55960F86F577076
+ alignment: 8
+body: |
+ bb.0:
+ %11 = VGBM 0
+ %43 = LHIMux 0
+ %44 = LARL %const.0
+ %45 = VL64 %44, 0, _ :: (load 8 from constant-pool)
+
+ bb.1:
+ ADJCALLSTACKDOWN 0, 0
+ %12 = LZDR
+ %f0d = COPY %12
+ CallBRASL $fmod, killed %f0d, undef %f2d, csr_systemz, implicit-def dead %r14d, implicit-def dead %cc, implicit-def %f0d
+ ADJCALLSTACKUP 0, 0
+ KILL killed %f0d
+
+ bb.2:
+ %17 = VLGVH %11, _, 0
+ %19 = LHR %17.subreg_l32
+ undef %20.subreg_l64 = LGHI 0
+ %20 = DSGFR %20, %19
+ %22 = VLGVH %11, _, 3
+ %24 = LHR %22.subreg_l32
+ undef %25.subreg_l64 = LGHI 0
+ %25 = DSGFR %25, %24
+ %31 = VLGVH %11, _, 1
+ %33 = LHR %31.subreg_l32
+ undef %34.subreg_l64 = LGHI 0
+ %34 = DSGFR %34, %33
+ %37 = VLGVH %11, _, 2
+ %39 = LHR %37.subreg_l32
+ undef %40.subreg_l64 = LGHI 0
+ %40 = DSGFR %40, %39
+ CHIMux %43, 0, implicit-def %cc
+ BRC 14, 6, %bb.2, implicit killed %cc
+ J %bb.3
+
+ bb.3:
+ WFCDB undef %46, %45, implicit-def %cc
+ %48 = IPM implicit killed %cc
+ %48 = AFIMux %48, 268435456, implicit-def dead %cc
+ %6 = RISBMux undef %6, %48, 31, 159, 35
+ WFCDB undef %50, %45, implicit-def %cc
+ BRC 15, 6, %bb.1, implicit killed %cc
+ J %bb.4
+
+ bb.4:
+ %36 = VLVGP %25.subreg_l64, %25.subreg_l64
+ %36 = VLVGH %36, %20.subreg_l32, _, 0
+ %36 = VLVGH %36, %34.subreg_l32, _, 1
+ dead %36 = VLVGH %36, %40.subreg_l32, _, 2
+ %4 = LG undef %42, 0, _ :: (load 8 from `i64* undef`)
+ undef %57.subreg_h64 = LLILL 0
+ undef %66.subreg_h64 = LLILL 0
+ undef %79.subreg_h64 = LLILL 0
+ undef %89.subreg_h64 = LLILL 0
+ %92 = LHIMux 0
+
+ bb.5:
+
+ bb.6:
+ %51 = VLGVH undef %7, _, 0
+ %53 = LLHRMux %51.subreg_l32
+ %54 = VLGVH undef %1, _, 0
+ %57.subreg_l32 = LLHRMux %54.subreg_l32
+ %58 = COPY %57
+ %58 = DLR %58, %53
+ %60 = VLGVH undef %7, _, 3
+ %62 = LLHRMux %60.subreg_l32
+ %63 = VLGVH undef %1, _, 3
+ %66.subreg_l32 = LLHRMux %63.subreg_l32
+ %67 = COPY %66
+ %67 = DLR %67, %62
+ %73 = VLGVH undef %7, _, 1
+ %75 = LLHRMux %73.subreg_l32
+ %76 = VLGVH undef %1, _, 1
+ %79.subreg_l32 = LLHRMux %76.subreg_l32
+ %80 = COPY %79
+ %80 = DLR %80, %75
+ %83 = VLGVH undef %7, _, 2
+ %85 = LLHRMux %83.subreg_l32
+ %86 = VLGVH undef %1, _, 2
+ %89.subreg_l32 = LLHRMux %86.subreg_l32
+ %90 = COPY %89
+ %90 = DLR %90, %85
+ CHIMux %92, 0, implicit-def %cc
+ BRC 14, 6, %bb.7, implicit killed %cc
+ J %bb.6
+
+ bb.7:
+ CGHI undef %93, 0, implicit-def %cc
+ %96 = IPM implicit killed %cc
+ CGHI undef %97, 0, implicit-def %cc
+ BRC 14, 6, %bb.6, implicit killed %cc
+
+ bb.8:
+ CHIMux %6, 0, implicit-def %cc
+ %10 = LLILL 41639
+ dead %10 = LOCGR %10, %4, 14, 6, implicit killed %cc
+ CHIMux %92, 0, implicit-def %cc
+ BRC 14, 6, %bb.5, implicit killed %cc
+ J %bb.9
+
+ bb.9:
+ %82 = VLVGP %67.subreg_h64, %67.subreg_h64
+ %82 = VLVGH %82, %58.subreg_hl32, _, 0
+ %82 = VLVGH %82, %80.subreg_hl32, _, 1
+ dead %82 = VLVGH %82, %90.subreg_hl32, _, 2
+ %96 = AFIMux %96, 1879048192, implicit-def dead %cc
+ %96 = SRL %96, _, 31
+ dead %11 = VLVGF %11, %96, _, 1
+ %100 = LHIMux 0
+
+ bb.10:
+ CHIMux %100, 0, implicit-def %cc
+ BRC 14, 6, %bb.10, implicit killed %cc
+ J %bb.11
+
+ bb.11:
+ Return
+
+...
diff --git a/test/CodeGen/X86/and-sink.ll b/test/CodeGen/X86/and-sink.ll
index 46e50f2a6a74..0f877e778c70 100644
--- a/test/CodeGen/X86/and-sink.ll
+++ b/test/CodeGen/X86/and-sink.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown -verify-machineinstrs < %s | FileCheck %s
; RUN: opt < %s -codegenprepare -S -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK-CGP %s
@@ -8,12 +9,20 @@
; Test that 'and' is sunk into bb0.
define i32 @and_sink1(i32 %a, i1 %c) {
; CHECK-LABEL: and_sink1:
-; CHECK: testb $1,
-; CHECK: je
-; CHECK-NOT: andl $4,
-; CHECK: movl $0, A
-; CHECK: testb $4,
-; CHECK: jne
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB0_3
+; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl $0, A
+; CHECK-NEXT: testb $4, %al
+; CHECK-NEXT: jne .LBB0_3
+; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB0_3: # %bb2
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retl
; CHECK-CGP-LABEL: @and_sink1(
; CHECK-CGP-NOT: and i32
@@ -37,16 +46,30 @@ bb2:
; Test that both 'and' and cmp get sunk to bb1.
define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
; CHECK-LABEL: and_sink2:
-; CHECK: movl $0, A
-; CHECK: testb $1,
-; CHECK: je
-; CHECK-NOT: andl $4,
-; CHECK: movl $0, B
-; CHECK: testb $1,
-; CHECK: je
-; CHECK: movl $0, C
-; CHECK: testb $4,
-; CHECK: jne
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $0, A
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB1_5
+; CHECK-NEXT: # BB#1: # %bb0.preheader
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_2: # %bb0
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl $0, B
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB1_5
+; CHECK-NEXT: # BB#3: # %bb1
+; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT: movl $0, C
+; CHECK-NEXT: testb $4, %cl
+; CHECK-NEXT: jne .LBB1_2
+; CHECK-NEXT: # BB#4: # %bb2
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB1_5: # %bb3
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retl
; CHECK-CGP-LABEL: @and_sink2(
; CHECK-CGP-NOT: and i32
@@ -77,12 +100,21 @@ bb3:
; Test that CodeGenPrepare doesn't get stuck in a loop sinking and hoisting a masked load.
define i32 @and_sink3(i1 %c, i32* %p) {
; CHECK-LABEL: and_sink3:
-; CHECK: testb $1,
-; CHECK: je
-; CHECK: movzbl
-; CHECK-DAG: movl $0, A
-; CHECK-DAG: testl %
-; CHECK: je
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB2_3
+; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl (%eax), %eax
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: movl $0, A
+; CHECK-NEXT: je .LBB2_2
+; CHECK-NEXT: .LBB2_3: # %bb2
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB2_2: # %bb1
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retl
; CHECK-CGP-LABEL: @and_sink3(
; CHECK-CGP: load i32
@@ -106,15 +138,26 @@ bb2:
; Test that CodeGenPrepare sinks/duplicates non-immediate 'and'.
define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
; CHECK-LABEL: and_sink4:
-; CHECK: testb $1,
-; CHECK: je
-; CHECK-NOT: andl
-; CHECK-DAG: movl $0, A
-; CHECK-DAG: testl [[REG1:%[a-z0-9]+]], [[REG2:%[a-z0-9]+]]
-; CHECK: jne
-; CHECK-DAG: movl {{%[a-z0-9]+}}, B
-; CHECK-DAG: testl [[REG1]], [[REG2]]
-; CHECK: je
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB3_4
+; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl %eax, %ecx
+; CHECK-NEXT: movl $0, A
+; CHECK-NEXT: jne .LBB3_4
+; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: leal (%ecx,%eax), %edx
+; CHECK-NEXT: testl %eax, %ecx
+; CHECK-NEXT: movl %edx, B
+; CHECK-NEXT: je .LBB3_3
+; CHECK-NEXT: .LBB3_4: # %bb3
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB3_3: # %bb2
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retl
; CHECK-CGP-LABEL: @and_sink4(
; CHECK-CGP-NOT: and i32
@@ -146,14 +189,26 @@ bb3:
; when it would increase register pressure.
define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
; CHECK-LABEL: and_sink5:
-; CHECK: testb $1,
-; CHECK: je
-; CHECK-DAG: andl {{[0-9]+\(%[a-z0-9]+\)}}, [[REG:%[a-z0-9]+]]
-; CHECK-DAG: movl $0, A
-; CHECK: jne
-; CHECK-DAG: movl {{%[a-z0-9]+}}, B
-; CHECK-DAG: testl [[REG]], [[REG]]
-; CHECK: je
+; CHECK: # BB#0:
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: je .LBB4_4
+; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl $0, A
+; CHECK-NEXT: jne .LBB4_4
+; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: movl %ecx, B
+; CHECK-NEXT: je .LBB4_3
+; CHECK-NEXT: .LBB4_4: # %bb3
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB4_3: # %bb2
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: retl
; CHECK-CGP-LABEL: @and_sink5(
; CHECK-CGP: and i32
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 33ac15de9de9..8f6afa8785d0 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: sitof32:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <16 x i32> %a to <16 x float>
@@ -19,7 +19,7 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind {
define <8 x double> @sltof864(<8 x i64> %a) {
; NODQ-LABEL: sltof864:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
@@ -49,7 +49,7 @@ define <8 x double> @sltof864(<8 x i64> %a) {
; NODQ-NEXT: retq
;
; DQ-LABEL: sltof864:
-; DQ: ## BB#0:
+; DQ: # BB#0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
@@ -58,7 +58,7 @@ define <8 x double> @sltof864(<8 x i64> %a) {
define <4 x double> @sltof464(<4 x i64> %a) {
; NODQ-LABEL: sltof464:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
@@ -74,15 +74,15 @@ define <4 x double> @sltof464(<4 x i64> %a) {
; NODQ-NEXT: retq
;
; VLDQ-LABEL: sltof464:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof464:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
@@ -90,7 +90,7 @@ define <4 x double> @sltof464(<4 x i64> %a) {
define <2 x float> @sltof2f32(<2 x i64> %a) {
; NODQ-LABEL: sltof2f32:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
@@ -101,15 +101,15 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
; NODQ-NEXT: retq
;
; VLDQ-LABEL: sltof2f32:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof2f32:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
@@ -118,7 +118,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
; KNL-LABEL: sltof4f32_mem:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -135,12 +135,12 @@ define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof4f32_mem:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sltof4f32_mem:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0
; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -158,15 +158,15 @@ define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof4f32_mem:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: sltof4f32_mem:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -189,7 +189,7 @@ define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
define <4 x i64> @f64tosl(<4 x double> %a) {
; NODQ-LABEL: f64tosl:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
@@ -207,15 +207,15 @@ define <4 x i64> @f64tosl(<4 x double> %a) {
; NODQ-NEXT: retq
;
; VLDQ-LABEL: f64tosl:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: f64tosl:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
@@ -223,7 +223,7 @@ define <4 x i64> @f64tosl(<4 x double> %a) {
define <4 x i64> @f32tosl(<4 x float> %a) {
; NODQ-LABEL: f32tosl:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; NODQ-NEXT: vcvttss2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
@@ -241,15 +241,15 @@ define <4 x i64> @f32tosl(<4 x float> %a) {
; NODQ-NEXT: retq
;
; VLDQ-LABEL: f32tosl:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: f32tosl:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
@@ -257,7 +257,7 @@ define <4 x i64> @f32tosl(<4 x float> %a) {
define <4 x float> @sltof432(<4 x i64> %a) {
; KNL-LABEL: sltof432:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
@@ -273,13 +273,13 @@ define <4 x float> @sltof432(<4 x i64> %a) {
; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof432:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sltof432:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VLNODQ-NEXT: vmovq %xmm0, %rax
@@ -296,15 +296,15 @@ define <4 x float> @sltof432(<4 x i64> %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof432:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: sltof432:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512BW-NEXT: vmovq %xmm0, %rax
@@ -325,7 +325,7 @@ define <4 x float> @sltof432(<4 x i64> %a) {
define <4 x float> @ultof432(<4 x i64> %a) {
; KNL-LABEL: ultof432:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vpextrq $1, %xmm0, %rax
; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; KNL-NEXT: vmovq %xmm0, %rax
@@ -341,13 +341,13 @@ define <4 x float> @ultof432(<4 x i64> %a) {
; KNL-NEXT: retq
;
; VLDQ-LABEL: ultof432:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ultof432:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; VLNODQ-NEXT: vmovq %xmm0, %rax
@@ -364,15 +364,15 @@ define <4 x float> @ultof432(<4 x i64> %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ultof432:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: ultof432:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512BW-NEXT: vmovq %xmm0, %rax
@@ -393,7 +393,7 @@ define <4 x float> @ultof432(<4 x i64> %a) {
define <8 x double> @ultof64(<8 x i64> %a) {
; NODQ-LABEL: ultof64:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
@@ -423,7 +423,7 @@ define <8 x double> @ultof64(<8 x i64> %a) {
; NODQ-NEXT: retq
;
; DQ-LABEL: ultof64:
-; DQ: ## BB#0:
+; DQ: # BB#0:
; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
@@ -432,7 +432,7 @@ define <8 x double> @ultof64(<8 x i64> %a) {
define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
; ALL-LABEL: fptosi00:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptosi <16 x float> %a to <16 x i32>
@@ -441,7 +441,7 @@ define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
; ALL-LABEL: fptoui00:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptoui <16 x float> %a to <16 x i32>
@@ -450,14 +450,14 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
; NOVL-LABEL: fptoui_256:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; NOVL-NEXT: retq
;
; VL-LABEL: fptoui_256:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvttps2udq %ymm0, %ymm0
; VL-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
@@ -466,30 +466,30 @@ define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
; KNL-LABEL: fptoui_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; VL-LABEL: fptoui_128:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_128:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: fptoui_128:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
@@ -498,7 +498,7 @@ define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
; ALL-LABEL: fptoui01:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptoui <8 x double> %a to <8 x i32>
@@ -507,31 +507,31 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
; KNL-LABEL: fptoui_256d:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; VL-LABEL: fptoui_256d:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_256d:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: fptoui_256d:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
@@ -540,7 +540,7 @@ define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
define <8 x double> @sitof64(<8 x i32> %a) {
; ALL-LABEL: sitof64:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x double>
@@ -548,31 +548,31 @@ define <8 x double> @sitof64(<8 x i32> %a) {
}
define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
; KNL-LABEL: sitof64_mask:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; VLBW-LABEL: sitof64_mask:
-; VLBW: ## BB#0:
+; VLBW: # BB#0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: sitof64_mask:
-; VLNOBW: ## BB#0:
+; VLNOBW: # BB#0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: sitof64_mask:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: sitof64_mask:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
@@ -583,31 +583,31 @@ define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind
}
define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
; KNL-LABEL: sitof64_maskz:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; VLBW-LABEL: sitof64_maskz:
-; VLBW: ## BB#0:
+; VLBW: # BB#0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: sitof64_maskz:
-; VLNOBW: ## BB#0:
+; VLNOBW: # BB#0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: sitof64_maskz:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: sitof64_maskz:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
@@ -619,7 +619,7 @@ define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
define <8 x i32> @fptosi01(<8 x double> %a) {
; ALL-LABEL: fptosi01:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptosi <8 x double> %a to <8 x i32>
@@ -628,12 +628,12 @@ define <8 x i32> @fptosi01(<8 x double> %a) {
define <4 x i32> @fptosi03(<4 x double> %a) {
; KNL-LABEL: fptosi03:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0
; KNL-NEXT: retq
;
; AVX512-LABEL: fptosi03:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -643,14 +643,14 @@ define <4 x i32> @fptosi03(<4 x double> %a) {
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
; NODQ-LABEL: fptrunc00:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: fptrunc00:
-; DQ: ## BB#0:
+; DQ: # BB#0:
; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
@@ -661,12 +661,12 @@ define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
define <4 x float> @fptrunc01(<4 x double> %b) {
; KNL-LABEL: fptrunc01:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
; KNL-NEXT: retq
;
; AVX512-LABEL: fptrunc01:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -676,7 +676,7 @@ define <4 x float> @fptrunc01(<4 x double> %b) {
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
; KNL-LABEL: fptrunc02:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
@@ -684,7 +684,7 @@ define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
; KNL-NEXT: retq
;
; VL-LABEL: fptrunc02:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpslld $31, %xmm1, %xmm1
; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
@@ -692,7 +692,7 @@ define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
; VL-NEXT: retq
;
; AVX512DQ-LABEL: fptrunc02:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0
@@ -701,7 +701,7 @@ define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: fptrunc02:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0
@@ -715,7 +715,7 @@ define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
; ALL-LABEL: fptrunc03:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
; ALL-NEXT: retq
%ext = extractelement <2 x double> %a0, i32 0
@@ -726,7 +726,7 @@ define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
define <8 x double> @fpext00(<8 x float> %b) nounwind {
; ALL-LABEL: fpext00:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
; ALL-NEXT: retq
%a = fpext <8 x float> %b to <8 x double>
@@ -735,14 +735,14 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind {
define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
; NOVL-LABEL: fpext01:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0
; NOVL-NEXT: retq
;
; VL-LABEL: fpext01:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; VL-NEXT: retq
@@ -754,7 +754,7 @@ define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
; ALL-LABEL: fpext02:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
@@ -765,7 +765,7 @@ define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
define double @funcA(i64* nocapture %e) {
; ALL-LABEL: funcA:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -776,7 +776,7 @@ entry:
define double @funcB(i32* %e) {
; ALL-LABEL: funcB:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -787,7 +787,7 @@ entry:
define float @funcC(i32* %e) {
; ALL-LABEL: funcC:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -798,7 +798,7 @@ entry:
define float @i64tof32(i64* %e) {
; ALL-LABEL: i64tof32:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -809,7 +809,7 @@ entry:
define void @fpext() {
; ALL-LABEL: fpext:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
@@ -825,7 +825,7 @@ entry:
define void @fpround_scalar() nounwind uwtable {
; ALL-LABEL: fpround_scalar:
-; ALL: ## BB#0: ## %entry
+; ALL: # BB#0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
@@ -841,7 +841,7 @@ entry:
define double @long_to_double(i64 %x) {
; ALL-LABEL: long_to_double:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vmovq %rdi, %xmm0
; ALL-NEXT: retq
%res = bitcast i64 %x to double
@@ -850,7 +850,7 @@ define double @long_to_double(i64 %x) {
define i64 @double_to_long(double %x) {
; ALL-LABEL: double_to_long:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: retq
%res = bitcast double %x to i64
@@ -859,7 +859,7 @@ define i64 @double_to_long(double %x) {
define float @int_to_float(i32 %x) {
; ALL-LABEL: int_to_float:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vmovd %edi, %xmm0
; ALL-NEXT: retq
%res = bitcast i32 %x to float
@@ -868,7 +868,7 @@ define float @int_to_float(i32 %x) {
define i32 @float_to_int(float %x) {
; ALL-LABEL: float_to_int:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: retq
%res = bitcast float %x to i32
@@ -877,7 +877,7 @@ define i32 @float_to_int(float %x) {
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
; NODQ-LABEL: uitof64:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
; NODQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
@@ -885,7 +885,7 @@ define <16 x double> @uitof64(<16 x i32> %a) nounwind {
; NODQ-NEXT: retq
;
; DQ-LABEL: uitof64:
-; DQ: ## BB#0:
+; DQ: # BB#0:
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
; DQ-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
@@ -896,31 +896,31 @@ define <16 x double> @uitof64(<16 x i32> %a) nounwind {
}
define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
; KNL-LABEL: uitof64_mask:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; VLBW-LABEL: uitof64_mask:
-; VLBW: ## BB#0:
+; VLBW: # BB#0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: uitof64_mask:
-; VLNOBW: ## BB#0:
+; VLNOBW: # BB#0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: uitof64_mask:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uitof64_mask:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
@@ -931,31 +931,31 @@ define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind
}
define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
; KNL-LABEL: uitof64_maskz:
-; KNL: ## BB#0:
+; KNL: # BB#0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; VLBW-LABEL: uitof64_maskz:
-; VLBW: ## BB#0:
+; VLBW: # BB#0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: uitof64_maskz:
-; VLNOBW: ## BB#0:
+; VLNOBW: # BB#0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: uitof64_maskz:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uitof64_maskz:
-; AVX512BW: ## BB#0:
+; AVX512BW: # BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
@@ -967,14 +967,14 @@ define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
; NOVL-LABEL: uitof64_256:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; NOVL-NEXT: retq
;
; VL-LABEL: uitof64_256:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
@@ -983,7 +983,7 @@ define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
define <16 x float> @uitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: uitof32:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x float>
@@ -992,14 +992,14 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
; NOVL-LABEL: uitof32_256:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; NOVL-NEXT: retq
;
; VL-LABEL: uitof32_256:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; VL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
@@ -1008,30 +1008,30 @@ define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
; KNL-LABEL: uitof32_128:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # BB#0:
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; VL-LABEL: uitof32_128:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
;
; AVX512DQ-LABEL: uitof32_128:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uitof32_128:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
@@ -1040,7 +1040,7 @@ define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
define i32 @fptosi02(float %a) nounwind {
; ALL-LABEL: fptosi02:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttss2si %xmm0, %eax
; ALL-NEXT: retq
%b = fptosi float %a to i32
@@ -1049,7 +1049,7 @@ define i32 @fptosi02(float %a) nounwind {
define i32 @fptoui02(float %a) nounwind {
; ALL-LABEL: fptoui02:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvttss2usi %xmm0, %eax
; ALL-NEXT: retq
%b = fptoui float %a to i32
@@ -1058,7 +1058,7 @@ define i32 @fptoui02(float %a) nounwind {
define float @uitofp02(i32 %a) nounwind {
; ALL-LABEL: uitofp02:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to float
@@ -1067,7 +1067,7 @@ define float @uitofp02(i32 %a) nounwind {
define double @uitofp03(i32 %a) nounwind {
; ALL-LABEL: uitofp03:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to double
@@ -1076,7 +1076,7 @@ define double @uitofp03(i32 %a) nounwind {
define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
; NODQ-LABEL: sitofp_16i1_float:
-; NODQ: ## BB#0:
+; NODQ: # BB#0:
; NODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -1084,7 +1084,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
; NODQ-NEXT: retq
;
; DQ-LABEL: sitofp_16i1_float:
-; DQ: ## BB#0:
+; DQ: # BB#0:
; DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; DQ-NEXT: vpmovm2d %k0, %zmm0
@@ -1097,7 +1097,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
; ALL-LABEL: sitofp_16i8_float:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1107,7 +1107,7 @@ define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
; ALL-LABEL: sitofp_16i16_float:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1117,7 +1117,7 @@ define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
; ALL-LABEL: sitofp_8i16_double:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
@@ -1127,7 +1127,7 @@ define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
; ALL-LABEL: sitofp_8i8_double:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: vpslld $24, %ymm0, %ymm0
; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
@@ -1139,7 +1139,7 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; NOVLDQ-LABEL: sitofp_16i1_double:
-; NOVLDQ: ## BB#0:
+; NOVLDQ: # BB#0:
; NOVLDQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
@@ -1152,7 +1152,7 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sitofp_16i1_double:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
@@ -1163,7 +1163,7 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_16i1_double:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
@@ -1175,7 +1175,7 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_16i1_double:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
@@ -1191,7 +1191,7 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; NOVLDQ-LABEL: sitofp_8i1_double:
-; NOVLDQ: ## BB#0:
+; NOVLDQ: # BB#0:
; NOVLDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -1200,7 +1200,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sitofp_8i1_double:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
@@ -1208,7 +1208,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_8i1_double:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -1217,7 +1217,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_8i1_double:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
@@ -1230,8 +1230,8 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
; NOVLDQ-LABEL: sitofp_8i1_float:
-; NOVLDQ: ## BB#0:
-; NOVLDQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVLDQ: # BB#0:
+; NOVLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NOVLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -1240,7 +1240,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sitofp_8i1_float:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
@@ -1248,7 +1248,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_8i1_float:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -1257,8 +1257,8 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_8i1_float:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
@@ -1271,14 +1271,14 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
; NOVL-LABEL: sitofp_4i1_float:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVL-NEXT: retq
;
; VLDQ-LABEL: sitofp_4i1_float:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
@@ -1286,7 +1286,7 @@ define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_4i1_float:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1300,7 +1300,7 @@ define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
; NOVL-LABEL: sitofp_4i1_double:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; NOVL-NEXT: vpmovqd %zmm0, %ymm0
@@ -1308,7 +1308,7 @@ define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
; NOVL-NEXT: retq
;
; VLDQ-LABEL: sitofp_4i1_double:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
@@ -1316,7 +1316,7 @@ define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_4i1_double:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1330,14 +1330,14 @@ define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
; NOVL-LABEL: sitofp_2i1_float:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVL-NEXT: retq
;
; VLDQ-LABEL: sitofp_2i1_float:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
@@ -1345,7 +1345,7 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_2i1_float:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1359,7 +1359,7 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
; NOVL-LABEL: sitofp_2i1_double:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1367,7 +1367,7 @@ define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
; NOVL-NEXT: retq
;
; VLDQ-LABEL: sitofp_2i1_double:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2q %k0, %xmm0
@@ -1375,7 +1375,7 @@ define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sitofp_2i1_double:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1393,7 +1393,7 @@ define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
define <16 x float> @uitofp_16i8(<16 x i8>%a) {
; ALL-LABEL: uitofp_16i8:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1403,7 +1403,7 @@ define <16 x float> @uitofp_16i8(<16 x i8>%a) {
define <16 x float> @uitofp_16i16(<16 x i16>%a) {
; ALL-LABEL: uitofp_16i16:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1413,7 +1413,7 @@ define <16 x float> @uitofp_16i16(<16 x i16>%a) {
define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
; ALL-LABEL: uitofp_16i1_float:
-; ALL: ## BB#0:
+; ALL: # BB#0:
; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; ALL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
@@ -1426,7 +1426,7 @@ define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
; NOVL-LABEL: uitofp_16i1_double:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVL-NEXT: movq {{.*}}(%rip), %rax
@@ -1440,7 +1440,7 @@ define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_16i1_double:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; VL-NEXT: movl {{.*}}(%rip), %eax
@@ -1457,18 +1457,18 @@ define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
; NOVL-LABEL: uitofp_8i1_float:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; NOVL-NEXT: vpmovqd %zmm0, %ymm0
; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_8i1_float:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
@@ -1481,8 +1481,8 @@ define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
; NOVL-LABEL: uitofp_8i1_double:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
@@ -1491,7 +1491,7 @@ define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_8i1_double:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
@@ -1504,7 +1504,7 @@ define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
; NOVL-LABEL: uitofp_4i1_float:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
@@ -1512,7 +1512,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_4i1_float:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
@@ -1525,7 +1525,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
; NOVL-LABEL: uitofp_4i1_double:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpsrld $31, %xmm0, %xmm0
@@ -1533,7 +1533,7 @@ define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_4i1_double:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
@@ -1546,7 +1546,7 @@ define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; NOVL-LABEL: uitofp_2i1_float:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
@@ -1562,7 +1562,7 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; NOVL-NEXT: retq
;
; VL-LABEL: uitofp_2i1_float:
-; VL: ## BB#0:
+; VL: # BB#0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
@@ -1576,7 +1576,7 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
; NOVL-LABEL: uitofp_2i1_double:
-; NOVL: ## BB#0:
+; NOVL: # BB#0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
@@ -1586,7 +1586,7 @@ define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
; NOVL-NEXT: retq
;
; VLDQ-LABEL: uitofp_2i1_double:
-; VLDQ: ## BB#0:
+; VLDQ: # BB#0:
; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
@@ -1595,7 +1595,7 @@ define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: uitofp_2i1_double:
-; VLNODQ: ## BB#0:
+; VLNODQ: # BB#0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index b13965a30ed8..bbe31c5c2ac5 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -1203,3 +1203,35 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
ret <8 x double> %res2
}
+
+
+; ALL: .LCPI38
+; ALL-NEXT: .long 4290379776 # 0xffba0000
+
+; AVX: .LCPI38
+; AVX-NEXT: .long 4290379776 # float NaN
+
+define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i32_NaN:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT: retl
+;
+; ALL64-LABEL: f8xi16_i32_NaN:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT: retq
+;
+; AVX-LABEL: f8xi16_i32_NaN:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+ %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a
+ %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1
+ ret <8 x i16> %res2
+}
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index ae0f4406ba0d..1218b68b1be4 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -405,12 +405,7 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
;
; AVX-LABEL: _clearupper16xi8a:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <16 x i8> %0, i32 0
%x1 = extractelement <16 x i8> %0, i32 1
@@ -575,39 +570,10 @@ define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: _clearupper32xi8a:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $0, %xmm1, %edx
-; AVX1-NEXT: vpextrb $1, %xmm1, %esi
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: _clearupper32xi8a:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $0, %xmm1, %edx
-; AVX2-NEXT: vpextrb $1, %xmm1, %esi
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper32xi8a:
+; AVX: # BB#0:
+; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
%x0 = extractelement <32 x i8> %0, i32 0
%x1 = extractelement <32 x i8> %0, i32 1
%x2 = extractelement <32 x i8> %0, i32 2
diff --git a/test/CodeGen/X86/scavenger.mir b/test/CodeGen/X86/scavenger.mir
new file mode 100644
index 000000000000..8d97aeb22cb9
--- /dev/null
+++ b/test/CodeGen/X86/scavenger.mir
@@ -0,0 +1,54 @@
+# RUN: llc -mtriple=i386-- -run-pass scavenger-test -verify-machineinstrs -o - %s | FileCheck %s
+---
+# CHECK-LABEL: name: func0
+name: func0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0 : gr32 = MOV32ri 42
+ %ebp = COPY %0
+...
+---
+# CHECK-LABEL: name: func2
+name: func2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-NOT: %eax = MOV32ri 42
+ ; CHECK: [[REG0:%e[a-z]+]] = MOV32ri 42
+ ; CHECK: %ebp = COPY [[REG0]]
+ %eax = MOV32ri 13
+ %0 : gr32 = MOV32ri 42
+ %ebp = COPY %0
+
+ ; CHECK: [[REG1:%e[a-z]+]] = MOV32ri 23
+ ; CHECK: [[REG2:%e[a-z]+]] = MOV32ri 7
+ ; CHECK: [[REG1]] = ADD32ri8 [[REG1]], 5, implicit-def dead %eflags
+ %1 : gr32 = MOV32ri 23
+ %2 : gr32 = MOV32ri 7
+ %1 = ADD32ri8 %1, 5, implicit-def dead %eflags
+
+ NOOP implicit %ebp
+
+ ; CHECK: NOOP implicit [[REG2]]
+ ; CHECK: NOOP implicit [[REG1]]
+ NOOP implicit %2
+ NOOP implicit %1
+ RETQ %eax
+...
+---
+# Defs without uses are currently broken
+#name: func3
+#tracksRegLiveness: true
+#body: |
+# bb.0:
+# dead %0 : gr32 = MOV32ri 42
+...
+---
+# Uses without defs are currently broken (and honestly not that useful).
+#name: func4
+#tracksRegLiveness: true
+#body: |
+# bb.0:
+# NOOP undef implicit %0 : gr32
+...
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 1afef86a5f11..7c2937936313 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -15,6 +15,7 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK-NEXT: cmovneq %rdi, %rsi
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test1:
; MCU: # BB#0:
@@ -55,6 +56,7 @@ define i32 @test2() nounwind {
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
; CHECK-NEXT: LBB1_1: ## %bb90
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test2:
; MCU: # BB#0: # %entry
@@ -100,6 +102,7 @@ define float @test3(i32 %x) nounwind readnone {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test3:
; MCU: # BB#0: # %entry
@@ -123,6 +126,7 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
; CHECK-NEXT: seta %al
; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test4:
; MCU: # BB#0: # %entry
@@ -157,6 +161,7 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: movd %xmm0, (%rsi)
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test5:
; MCU: # BB#0:
@@ -196,6 +201,7 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-NEXT: mulps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test6:
; MCU: # BB#0:
@@ -267,6 +273,7 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: fldt (%rax,%rcx)
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test7:
; MCU: # BB#0:
@@ -319,6 +326,7 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
; GENERIC-NEXT: movdqa %xmm1, (%rsi)
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test8:
; ATOM: ## BB#0:
@@ -358,6 +366,7 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movq %xmm0, 16(%rsi)
; ATOM-NEXT: movdqa %xmm1, (%rsi)
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test8:
; MCU: # BB#0:
@@ -448,6 +457,7 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9:
; ATOM: ## BB#0:
@@ -457,6 +467,7 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9:
; MCU: # BB#0:
@@ -483,6 +494,7 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9a:
; ATOM: ## BB#0:
@@ -492,6 +504,7 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9a:
; MCU: # BB#0:
@@ -516,6 +529,7 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9b:
; ATOM: ## BB#0:
@@ -525,6 +539,7 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9b:
; MCU: # BB#0:
@@ -552,6 +567,7 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq $1, %rax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test10:
; ATOM: ## BB#0:
@@ -561,6 +577,7 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test10:
; MCU: # BB#0:
@@ -586,6 +603,7 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11:
; MCU: # BB#0:
@@ -612,6 +630,7 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11a:
; MCU: # BB#0:
@@ -641,6 +660,7 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; GENERIC-NEXT: movq $-1, %rdi
; GENERIC-NEXT: cmovnoq %rax, %rdi
; GENERIC-NEXT: jmp __Znam ## TAILCALL
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test12:
; ATOM: ## BB#0: ## %entry
@@ -650,6 +670,7 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; ATOM-NEXT: movq $-1, %rdi
; ATOM-NEXT: cmovnoq %rax, %rdi
; ATOM-NEXT: jmp __Znam ## TAILCALL
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test12:
; MCU: # BB#0: # %entry
@@ -700,6 +721,7 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; GENERIC-NEXT: cmpl %esi, %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test13:
; ATOM: ## BB#0:
@@ -710,6 +732,7 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test13:
; MCU: # BB#0:
@@ -728,6 +751,7 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: notl %eax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test14:
; ATOM: ## BB#0:
@@ -737,6 +761,7 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test14:
; MCU: # BB#0:
@@ -756,6 +781,7 @@ define i32 @test15(i32 %x) nounwind {
; GENERIC-NEXT: negl %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test15:
; ATOM: ## BB#0: ## %entry
@@ -766,6 +792,7 @@ define i32 @test15(i32 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test15:
; MCU: # BB#0: # %entry
@@ -817,6 +844,7 @@ define i16 @test17(i16 %x) nounwind {
; GENERIC-NEXT: negw %di
; GENERIC-NEXT: sbbw %ax, %ax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test17:
; ATOM: ## BB#0: ## %entry
@@ -827,6 +855,7 @@ define i16 @test17(i16 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test17:
; MCU: # BB#0: # %entry
@@ -846,6 +875,7 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; GENERIC-NEXT: cmovgel %edx, %esi
; GENERIC-NEXT: movl %esi, %eax
; GENERIC-NEXT: retq
+; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test18:
; ATOM: ## BB#0:
@@ -855,6 +885,7 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
+; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test18:
; MCU: # BB#0:
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index 41f5d2d5be23..7f35258377ec 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -1,8 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
declare void @bar()
define void @test1(i32* nocapture %X) nounwind minsize {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $47, (%rdi)
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%tmp1 = load i32, i32* %X, align 4
%and = and i32 %tmp1, 255
@@ -15,11 +22,15 @@ if.then:
if.end:
ret void
-; CHECK-LABEL: test1:
-; CHECK: cmpb $47, (%{{rdi|rcx}})
}
define void @test2(i32 %X) nounwind minsize {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $47, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 47
@@ -31,11 +42,15 @@ if.then:
if.end:
ret void
-; CHECK-LABEL: test2:
-; CHECK: cmpb $47, %{{dil|cl}}
}
define void @test3(i32 %X) nounwind minsize {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $-1, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 255
@@ -47,12 +62,22 @@ if.then:
if.end:
ret void
-; CHECK-LABEL: test3:
-; CHECK: cmpb $-1, %{{dil|cl}}
}
; PR16083
define i1 @test4(i64 %a, i32 %b) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: je .LBB3_1
+; CHECK-NEXT: # BB#2: # %lor.end
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB3_1: # %lor.rhs
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
entry:
%tobool = icmp ne i32 %b, 0
br i1 %tobool, label %lor.end, label %lor.rhs
@@ -71,6 +96,16 @@ lor.end: ; preds = %lor.rhs, %entry
; PR16551
define void @test5(i32 %X) nounwind minsize {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movzbl x+{{.*}}(%rip), %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movzwl x+{{.*}}(%rip), %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: cmpl $1, %ecx
+; CHECK-NEXT: jne bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
%bf.lshr = lshr i56 %bf.load, 32
@@ -84,17 +119,16 @@ if.then:
if.end:
ret void
-
-; CHECK-LABEL: test5:
-; CHECK-NOT: cmpl $1,{{.*}}x+4
-; CHECK: ret
}
-; CHECK-LABEL: test2_1:
-; CHECK: movzbl
-; CHECK: cmpl $256
-; CHECK: je bar
define void @test2_1(i32 %X) nounwind minsize {
+; CHECK-LABEL: test2_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: cmpl $256, %eax # imm = 0x100
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 256
@@ -108,9 +142,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_1:
-; CHECK: cmpb $1, %{{dil|cl}}
define void @test_sext_i8_icmp_1(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $1, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, 1
@@ -124,9 +162,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_47:
-; CHECK: cmpb $47, %{{dil|cl}}
define void @test_sext_i8_icmp_47(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_47:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $47, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, 47
@@ -140,9 +182,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_127:
-; CHECK: cmpb $127, %{{dil|cl}}
define void @test_sext_i8_icmp_127(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_127:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $127, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, 127
@@ -156,9 +202,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_neg1:
-; CHECK: cmpb $-1, %{{dil|cl}}
define void @test_sext_i8_icmp_neg1(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_neg1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $-1, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, -1
@@ -172,9 +222,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_neg2:
-; CHECK: cmpb $-2, %{{dil|cl}}
define void @test_sext_i8_icmp_neg2(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_neg2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $-2, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, -2
@@ -188,9 +242,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_neg127:
-; CHECK: cmpb $-127, %{{dil|cl}}
define void @test_sext_i8_icmp_neg127(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_neg127:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $-127, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, -127
@@ -204,9 +262,13 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_neg128:
-; CHECK: cmpb $-128, %{{dil|cl}}
define void @test_sext_i8_icmp_neg128(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_neg128:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cmpb $-128, %dil
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, -128
@@ -220,11 +282,14 @@ if.end:
ret void
}
-; CHECK-LABEL: test_sext_i8_icmp_255:
-; CHECK: movb $1,
-; CHECK: testb
-; CHECK: je bar
define void @test_sext_i8_icmp_255(i8 %x) nounwind minsize {
+; CHECK-LABEL: test_sext_i8_icmp_255:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je bar # TAILCALL
+; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
%cmp = icmp eq i32 %sext, 255
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 6d51fb54f8b8..79b949a6ccb1 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -14,6 +14,7 @@ define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp3 = load <8 x i16>, <8 x i16>* %old
%tmp6 = shufflevector <8 x i16> %tmp3,
@@ -32,6 +33,7 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; X64-NEXT: andps (%rdi), %xmm0
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp1 = load <8 x i16>, <8 x i16>* %A
%tmp2 = load <8 x i16>, <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
@@ -48,6 +50,7 @@ define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-NEXT: pandn %xmm1, %xmm2
; X64-NEXT: por %xmm2, %xmm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
}
@@ -61,6 +64,7 @@ define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
ret <8 x i16> %tmp
}
@@ -73,6 +77,7 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
ret <8 x i16> %tmp
}
@@ -83,6 +88,7 @@ define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
ret <8 x i16> %tmp
}
@@ -92,6 +98,7 @@ define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
ret <8 x i16> %tmp
}
@@ -102,6 +109,7 @@ define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
ret <8 x i16> %tmp
}
@@ -113,6 +121,7 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = load <2 x i64>, <2 x i64>* %A
%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
@@ -143,6 +152,7 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
%tmp = load <4 x float>, <4 x float>* %r
%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
%tmp.upgrd.4 = load double, double* %tmp.upgrd.3
@@ -179,6 +189,7 @@ define void @t10() nounwind {
; X64-NEXT: movq _g2@{{.*}}(%rip), %rax
; X64-NEXT: movq %xmm0, (%rax)
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
load <4 x i32>, <4 x i32>* @g1, align 16
bitcast <4 x i32> %1 to <8 x i16>
shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
@@ -196,6 +207,7 @@ define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-NEXT: psrld $16, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp7
@@ -209,6 +221,7 @@ define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
ret <8 x i16> %tmp9
@@ -222,6 +235,7 @@ define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
ret <8 x i16> %tmp9
@@ -234,6 +248,7 @@ define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
ret <8 x i16> %tmp9
@@ -247,6 +262,7 @@ define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp8
@@ -260,6 +276,7 @@ define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
%tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
@@ -275,6 +292,7 @@ define <4 x i32> @t17() nounwind {
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
+; X64-NEXT: ## -- End function
entry:
%tmp1 = load <4 x float>, <4 x float>* undef, align 16
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index a00d47bb13e9..f937d484ce0d 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1926,5 +1926,19 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %6
}
+define <4 x float> @stack_nofold_insertps(<8 x float> %a0, <8 x float> %a1) {
+; Cannot fold this without changing the immediate.
+; CHECK-LABEL: stack_nofold_insertps
+; CHECK: 32-byte Spill
+; CHECK: nop
+; CHECK: 32-byte Reload
+; CHECK: vinsertps $179, {{%xmm., %xmm., %xmm.}}
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %v0 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v1 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v0, <4 x float> %v1, i8 179)
+ ret <4 x float> %res
+}
+
attributes #0 = { "unsafe-fp-math"="false" }
attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
index 9f5418432abc..b8e5c82913a5 100644
--- a/test/CodeGen/X86/statepoint-allocas.ll
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
; Check that we can lower a use of an alloca both as a deopt value (where the
; exact meaning is up to the consumer of the stackmap) and as an explicit spill
; slot used for GC.
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index 6e5cdd605122..bd2dd53b654a 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
; This file contains a collection of basic tests to ensure we didn't
; screw up normal call lowering when there are no deopt or gc arguments.
diff --git a/test/CodeGen/X86/statepoint-far-call.ll b/test/CodeGen/X86/statepoint-far-call.ll
index dc49061f6461..9f9b684efae8 100644
--- a/test/CodeGen/X86/statepoint-far-call.ll
+++ b/test/CodeGen/X86/statepoint-far-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
; Test to check that Statepoints with X64 far-immediate targets
; are lowered correctly to an indirect call via a scratch register.
diff --git a/test/CodeGen/X86/statepoint-forward.ll b/test/CodeGen/X86/statepoint-forward.ll
index d97bc0c75602..bee4b5ac884e 100644
--- a/test/CodeGen/X86/statepoint-forward.ll
+++ b/test/CodeGen/X86/statepoint-forward.ll
@@ -1,5 +1,5 @@
; RUN: opt -O3 -S < %s | FileCheck --check-prefix=CHECK-OPT %s
-; RUN: llc < %s | FileCheck --check-prefix=CHECK-LLC %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LLC %s
; These tests are targeted at making sure we don't retain information
; about memory which contains potential gc references across a statepoint.
; They're carefully written to only outlaw forwarding of references.
diff --git a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index 11dbe9e2e6c1..b88ca03805f2 100644
--- a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
; This file contains a collection of basic tests to ensure we didn't
; screw up normal call lowering when a statepoint is a GC transition.
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 3e8b8ca49f1d..29f8e3ed4f78 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s 2>&1 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s 2>&1 | FileCheck %s
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/statepoint-live-in.ll b/test/CodeGen/X86/statepoint-live-in.ll
index abe2b0a7acc8..aaa4d7c8422a 100644
--- a/test/CodeGen/X86/statepoint-live-in.ll
+++ b/test/CodeGen/X86/statepoint-live-in.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O3 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -O3 < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
@@ -6,38 +7,70 @@ declare void @bar() #0
declare void @baz()
define void @test1(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Lcfi0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
entry:
; We expect the argument to be passed in an extra register to bar
-; CHECK-LABEL: test1
-; CHECK: pushq %rax
-; CHECK-NEXT: Lcfi0:
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: callq _bar
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
ret void
}
define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: Lcfi1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: Lcfi2:
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Lcfi3:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: Lcfi4:
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: Lcfi5:
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
entry:
; Because the first call clobbers esi, we have to move the values into
; new registers. Note that they stay in the registers for both calls.
-; CHECK-LABEL: @test2
-; CHECK: movl %esi, %ebx
-; CHECK-NEXT: movl %edi, %ebp
-; CHECK-NEXT: callq _bar
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %a, i32 %b)
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 2, i32 %b, i32 %a)
ret void
}
define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i) gc "statepoint-example" {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Lcfi6:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
entry:
-; TODO: We should have folded the reload into the statepoint.
-; CHECK-LABEL: @test3
-; CHECK: pushq %rax
-; CHECK-NEXT: Lcfi
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: callq _bar
+; We directly reference the argument slot.
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 9, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i)
ret void
}
@@ -47,25 +80,39 @@ entry:
; also ends up being a good test of whether we can fold loads from immutable
; stack slots into the statepoint.
define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Lcfi7:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
entry:
-; CHECK-LABEL: test4
-; CHECK: pushq %rax
-; CHECK-NEXT: Lcfi
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: callq _bar
%statepoint_token1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 26, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z)
ret void
}
; A live-through gc-value must be spilled even if it is also a live-in deopt
; value. For live-in, we could technically report the register copy, but from
-; a code quality perspective it's better to reuse the required stack slot so
+; a code quality perspective it's better to reuse the required stack slot so
; as to put less stress on the register allocator for no benefit.
define i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Lcfi8:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movq %rsi, (%rsp)
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp5:
+; CHECK-NEXT: movq (%rsp), %rax
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+;
entry:
-; CHECK-LABEL: test5
-; CHECK: movq %rsi, (%rsp)
-; CHECK-NEXT: callq _bar
%token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a, i32 addrspace(1)* %p, i32 addrspace(1)* %p)
%p2 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %token, i32 9, i32 9)
ret i32 addrspace(1)* %p2
@@ -73,14 +120,27 @@ entry:
; Show the interaction of live-through spilling followed by live-in.
define void @test6(i32 %a) gc "statepoint-example" {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: Lcfi9:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: subq $16, %rsp
+; CHECK-NEXT: Lcfi10:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: Lcfi11:
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: callq _baz
+; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: callq _bar
+; CHECK-NEXT: Ltmp7:
+; CHECK-NEXT: addq $16, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+;
entry:
-; TODO: We could have reused the previous spill slot at zero additional cost.
-; CHECK-LABEL: test6
-; CHECK: movl %edi, %ebx
-; CHECK: movl %ebx, 12(%rsp)
-; CHECK-NEXT: callq _baz
-; CHECK-NEXT: Ltmp
-; CHECK-NEXT: callq _bar
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 %a)
call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 %a)
ret void
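
As a reader's sketch of the live-through pattern that test5 and test6 exercise (hypothetical names; the operand layout mirrors the statepoint calls above, and the gc.relocate indices count statepoint operands from zero): a gc pointer listed on the statepoint must survive the call, so it gets a stack slot and comes back through gc.relocate.

    declare void @callee()
    declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
    declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)

    define i32 addrspace(1)* @sketch(i32 addrspace(1)* %p) gc "statepoint-example" {
      ; %p is live across @callee, so it is spilled near the call site
      %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @callee, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %p)
      ; operands 0-6 are the statepoint header, so %p is operand 7
      %p.rel = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 7, i32 7)
      ret i32 addrspace(1)* %p.rel
    }
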
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index 5c27898f284a..b16426eae3d5 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -stack-symbol-ordering=0 < %s | FileCheck %s
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index 0506381b9ec2..966f66815f92 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -stack-symbol-ordering=0 -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
; This test is a sanity check to ensure statepoints are generating StackMap
; sections correctly. This is not intended to be a rigorous test of the
diff --git a/test/CodeGen/X86/statepoint-uniqueing.ll b/test/CodeGen/X86/statepoint-uniqueing.ll
index e791bc6b2333..a5fa1f2d99c9 100644
--- a/test/CodeGen/X86/statepoint-uniqueing.ll
+++ b/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
; Checks for a crash we had when two gc.relocate calls would
; relocate identical values
diff --git a/test/CodeGen/X86/statepoint-vector-bad-spill.ll b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
index 848988589cb0..7c55491bb1be 100644
--- a/test/CodeGen/X86/statepoint-vector-bad-spill.ll
+++ b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -O3 < %s | FileCheck %s
; This is checking for a crash.
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
index 000e88742880..5bc8f983ff06 100644
--- a/test/CodeGen/X86/statepoint-vector.ll
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stack-symbol-ordering=0 -mcpu=nehalem -debug-only=stackmaps < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -stack-symbol-ordering=0 -mcpu=nehalem -debug-only=stackmaps < %s | FileCheck %s
; REQUIRES: asserts
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/vector-unsigned-cmp.ll b/test/CodeGen/X86/vector-unsigned-cmp.ll
new file mode 100644
index 000000000000..fc246669992c
--- /dev/null
+++ b/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; PR33276 - https://bugs.llvm.org/show_bug.cgi?id=33276
+; If both operands of an unsigned icmp are known non-negative, then
+; we don't need to flip the sign bits in order to map to signed pcmpgt*.
+
+define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: ugt_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $1, %xmm0
+; SSE-NEXT: psrlq $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ugt_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <2 x i64> %x, <i64 1, i64 1>
+ %sh2 = lshr <2 x i64> %y, <i64 1, i64 1>
+ %cmp = icmp ugt <2 x i64> %sh1, %sh2
+ ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: ult_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $1, %xmm0
+; SSE-NEXT: psrlq $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ult_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <2 x i64> %x, <i64 1, i64 1>
+ %sh2 = lshr <2 x i64> %y, <i64 1, i64 1>
+ %cmp = icmp ult <2 x i64> %sh1, %sh2
+ ret <2 x i1> %cmp
+}
+
+define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: uge_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $1, %xmm0
+; SSE-NEXT: psrlq $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uge_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <2 x i64> %x, <i64 1, i64 1>
+ %sh2 = lshr <2 x i64> %y, <i64 1, i64 1>
+ %cmp = icmp uge <2 x i64> %sh1, %sh2
+ ret <2 x i1> %cmp
+}
+
+define <2 x i1> @ule_v2i64(<2 x i64> %x, <2 x i64> %y) {
+; SSE-LABEL: ule_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: psrlq $1, %xmm0
+; SSE-NEXT: psrlq $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ule_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <2 x i64> %x, <i64 1, i64 1>
+ %sh2 = lshr <2 x i64> %y, <i64 1, i64 1>
+ %cmp = icmp ule <2 x i64> %sh1, %sh2
+ ret <2 x i1> %cmp
+}
+
+define <4 x i1> @ugt_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: ugt_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psrld $1, %xmm0
+; SSE-NEXT: psrld $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ugt_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ugt_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %sh1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %sh2 = lshr <4 x i32> %y, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ugt <4 x i32> %sh1, %sh2
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @ult_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: ult_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psrld $1, %xmm0
+; SSE-NEXT: psrld $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: ult_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ult_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+ %sh1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %sh2 = lshr <4 x i32> %y, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ult <4 x i32> %sh1, %sh2
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @uge_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: uge_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uge_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: uge_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %sh2 = lshr <4 x i32> %y, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp uge <4 x i32> %sh1, %sh2
+ ret <4 x i1> %cmp
+}
+
+define <4 x i1> @ule_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: ule_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ule_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: psrld $1, %xmm1
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ule_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+ %sh2 = lshr <4 x i32> %y, <i32 1, i32 1, i32 1, i32 1>
+ %cmp = icmp ule <4 x i32> %sh1, %sh2
+ ret <4 x i1> %cmp
+}
+
+define <8 x i1> @ugt_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SSE-LABEL: ugt_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ugt_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %sh2 = lshr <8 x i16> %y, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %cmp = icmp ugt <8 x i16> %sh1, %sh2
+ ret <8 x i1> %cmp
+}
+
+define <8 x i1> @ult_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SSE-LABEL: ult_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pcmpgtw %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ult_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %sh2 = lshr <8 x i16> %y, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %cmp = icmp ult <8 x i16> %sh1, %sh2
+ ret <8 x i1> %cmp
+}
+
+define <8 x i1> @uge_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: uge_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uge_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: uge_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %sh2 = lshr <8 x i16> %y, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %cmp = icmp uge <8 x i16> %sh1, %sh2
+ ret <8 x i1> %cmp
+}
+
+define <8 x i1> @ule_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: ule_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ule_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ule_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %sh2 = lshr <8 x i16> %y, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %cmp = icmp ule <8 x i16> %sh1, %sh2
+ ret <8 x i1> %cmp
+}
+
+define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: ugt_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ugt_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %cmp = icmp ugt <16 x i8> %sh1, %sh2
+ ret <16 x i1> %cmp
+}
+
+define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: ult_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pcmpgtb %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ult_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %cmp = icmp ult <16 x i8> %sh1, %sh2
+ ret <16 x i1> %cmp
+}
+
+define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: uge_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pmaxub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uge_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %cmp = icmp uge <16 x i8> %sh1, %sh2
+ ret <16 x i1> %cmp
+}
+
+define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: ule_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrlw $1, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pminub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: ule_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %sh1 = lshr <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %sh2 = lshr <16 x i8> %y, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %cmp = icmp ule <16 x i8> %sh1, %sh2
+ ret <16 x i1> %cmp
+}
+
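For orientation, the identity these tests build on (standard two's-complement reasoning, not something this patch introduces): x86 has signed pcmpgt* but no unsigned vector compares, so an unsigned compare is lowered by flipping the sign bit of both operands. Once both inputs are known non-negative -- here forced by the lshr of 1 -- that flip changes nothing and could be dropped, which is what PR33276 asks for.

    define <4 x i1> @ugt_via_signed(<4 x i32> %x, <4 x i32> %y) {
      ; icmp ugt %x, %y  ==  icmp sgt (%x ^ 0x80000000), (%y ^ 0x80000000)
      %xf = xor <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
      %yf = xor <4 x i32> %y, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
      %c = icmp sgt <4 x i32> %xf, %yf
      ret <4 x i1> %c
    }

With the sign bits already clear, %xf == %x and %yf == %y, so the movdqa/pxor pairs visible in the SSE output above are pure overhead.
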
diff --git a/test/CodeGen/X86/wide-fma-contraction.ll b/test/CodeGen/X86/wide-fma-contraction.ll
index f51f917fbac9..99e03c891c00 100644
--- a/test/CodeGen/X86/wide-fma-contraction.ll
+++ b/test/CodeGen/X86/wide-fma-contraction.ll
@@ -1,26 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma -mtriple=x86_64-apple-darwin < %s | FileCheck %s
; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
; CHECK-LABEL: fmafunc
; CHECK-NOFMA-LABEL: fmafunc
define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
+; CHECK-LABEL: fmafunc:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: Lcfi0:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: Lcfi1:
+; CHECK-NEXT: .cfi_offset %ebp, -8
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: Lcfi2:
+; CHECK-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NEXT: andl $-32, %esp
+; CHECK-NEXT: subl $32, %esp
+; CHECK-NEXT: vfmaddps 8(%ebp), %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vfmaddps 40(%ebp), %ymm3, %ymm1, %ymm1
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+;
+; CHECK-NOFMA-LABEL: fmafunc:
+; CHECK-NOFMA: ## BB#0:
+; CHECK-NOFMA-NEXT: pushl %ebp
+; CHECK-NOFMA-NEXT: Lcfi0:
+; CHECK-NOFMA-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NOFMA-NEXT: Lcfi1:
+; CHECK-NOFMA-NEXT: .cfi_offset %ebp, -8
+; CHECK-NOFMA-NEXT: movl %esp, %ebp
+; CHECK-NOFMA-NEXT: Lcfi2:
+; CHECK-NOFMA-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NOFMA-NEXT: andl $-32, %esp
+; CHECK-NOFMA-NEXT: subl $32, %esp
+; CHECK-NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; CHECK-NOFMA-NEXT: vaddps 8(%ebp), %ymm0, %ymm0
+; CHECK-NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; CHECK-NOFMA-NEXT: vaddps 40(%ebp), %ymm1, %ymm1
+; CHECK-NOFMA-NEXT: movl %ebp, %esp
+; CHECK-NOFMA-NEXT: popl %ebp
+; CHECK-NOFMA-NEXT: retl
-; CHECK-NOT: vmulps
-; CHECK-NOT: vaddps
-; CHECK: vfmaddps
-; CHECK-NOT: vmulps
-; CHECK-NOT: vaddps
-; CHECK: vfmaddps
-; CHECK-NOT: vmulps
-; CHECK-NOT: vaddps
-
-; CHECK-NOFMA-NOT: calll
-; CHECK-NOFMA: vmulps
-; CHECK-NOFMA: vaddps
-; CHECK-NOFMA-NOT: calll
-; CHECK-NOFMA: vmulps
-; CHECK-NOFMA: vaddps
-; CHECK-NOFMA-NOT: calll
%ret = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
ret <16 x float> %ret
diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll
index 397e5bc10f5b..cd58dd1e7604 100644
--- a/test/CodeGen/X86/xor-icmp.ll
+++ b/test/CodeGen/X86/xor-icmp.ll
@@ -1,21 +1,33 @@
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
; rdar://7367229
define i32 @t(i32 %a, i32 %b) nounwind ssp {
+; X32-LABEL: t:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: xorb {{[0-9]+}}(%esp), %al
+; X32-NEXT: testb $64, %al
+; X32-NEXT: je .LBB0_1
+; X32-NEXT: # BB#2: # %bb1
+; X32-NEXT: jmp bar # TAILCALL
+; X32-NEXT: .LBB0_1: # %bb
+; X32-NEXT: jmp foo # TAILCALL
+;
+; X64-LABEL: t:
+; X64: # BB#0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl %esi, %eax
+; X64-NEXT: testb $64, %ah
+; X64-NEXT: je .LBB0_1
+; X64-NEXT: # BB#2: # %bb1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: jmp bar # TAILCALL
+; X64-NEXT: .LBB0_1: # %bb
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: jmp foo # TAILCALL
entry:
-; X32-LABEL: t:
-; X32: xorb
-; X32-NOT: andb
-; X32-NOT: shrb
-; X32: testb $64
-; X32: je
-
-; X64-LABEL: t:
-; X64-NOT: setne
-; X64: xorl
-; X64: testb $64
-; X64: je
%0 = and i32 %a, 16384
%1 = icmp ne i32 %0, 0
%2 = and i32 %b, 16384
@@ -38,20 +50,32 @@ declare i32 @bar(...)
define i32 @t2(i32 %x, i32 %y) nounwind ssp {
; X32-LABEL: t2:
-; X32: cmpl
-; X32: sete
-; X32: cmpl
-; X32: sete
-; X32-NOT: xor
-; X32: je
-
+; X32: # BB#0: # %entry
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %cl
+; X32-NEXT: cmpb %al, %cl
+; X32-NEXT: je .LBB1_1
+; X32-NEXT: # BB#2: # %bb
+; X32-NEXT: jmp foo # TAILCALL
+; X32-NEXT: .LBB1_1: # %return
+; X32-NEXT: retl
+;
; X64-LABEL: t2:
-; X64: testl
-; X64: sete
-; X64: testl
-; X64: sete
-; X64-NOT: xor
-; X64: je
+; X64: # BB#0: # %entry
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: sete %al
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: sete %cl
+; X64-NEXT: cmpb %al, %cl
+; X64-NEXT: je .LBB1_1
+; X64-NEXT: # BB#2: # %bb
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: jmp foo # TAILCALL
+; X64-NEXT: .LBB1_1: # %return
+; X64-NEXT: retq
+
entry:
%0 = icmp eq i32 %x, 0 ; <i1> [#uses=1]
%1 = icmp eq i32 %y, 0 ; <i1> [#uses=1]
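
A sketch of the combine the new checks for @t pin down (my restatement, not taken from the patch): comparing two single-bit tests for inequality is the same as testing that bit of the xor, and since bit 14 of a 32-bit value is bit 6 of its second byte, the X64 path can use testb $64 on %ah.

    define i1 @bit14_differs(i32 %a, i32 %b) {
      ; ((a & 16384) != 0) != ((b & 16384) != 0)  <=>  ((a ^ b) & 16384) != 0
      %x = xor i32 %a, %b
      %m = and i32 %x, 16384
      %c = icmp ne i32 %m, 0
      ret i1 %c
    }
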
diff --git a/test/DebugInfo/MIR/AArch64/clobber-sp.mir b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
new file mode 100644
index 000000000000..444faee81cb3
--- /dev/null
+++ b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
@@ -0,0 +1,181 @@
+# RUN: llc -start-after=livedebugvalues -filetype=obj -o - %s \
+# RUN: | llvm-dwarfdump - | FileCheck %s
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_formal_parameter
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NEXT: DW_AT_location [DW_FORM_data4] ([[LOC:.*]])
+# CHECK-NEXT: DW_AT_name {{.*}}"y"
+# CHECK: .debug_loc contents:
+# CHECK: [[LOC]]:
+# CHECK-SAME: Beginning address offset: 0x0000000000000000
+# CHECK-NEXT: Ending address offset: 0x0000000000000014
+# CHECK-NEXT: Location description: 51
+# reg1
+#
+# The range of y's [SP+8] location must not be interrupted by the call to h.
+# CHECK: Beginning address offset: 0x0000000000000014
+# CHECK-NEXT: Ending address offset: 0x0000000000000038
+# CHECK-NEXT: Location description: 8f 08
+# breg31 +8
+--- |
+ ; Generated at -Os from:
+ ; struct Rect {
+ ; double x, y, w, h;
+ ; };
+ ; void g(struct Rect);
+ ; void h(int *);
+ ; int f(int x, int y, struct Rect s) {
+ ; g(s);
+ ; if (y)
+ ; h(&x);
+ ; return 0;
+ ; }
+ source_filename = "/tmp/clobber.c"
+ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+ target triple = "arm64-apple-ios"
+
+ %struct.Rect = type { double, double, double, double }
+
+ ; Function Attrs: nounwind optsize ssp
+ define i32 @f(i32 %x, i32 %y, [4 x double] %s.coerce) local_unnamed_addr #0 !dbg !7 {
+ entry:
+ %x.addr = alloca i32, align 4
+ tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !19, metadata !22), !dbg !23
+ store i32 %x, i32* %x.addr, align 4, !tbaa !24
+ tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !20, metadata !22), !dbg !28
+ tail call void @llvm.dbg.declare(metadata %struct.Rect* undef, metadata !21, metadata !22), !dbg !29
+ tail call void @g([4 x double] %s.coerce) #4, !dbg !30
+ %tobool = icmp eq i32 %y, 0, !dbg !31
+ br i1 %tobool, label %if.end, label %if.then, !dbg !33
+
+ if.then: ; preds = %entry
+ tail call void @llvm.dbg.value(metadata i32* %x.addr, i64 0, metadata !19, metadata !22), !dbg !23
+ call void @h(i32* nonnull %x.addr) #4, !dbg !34
+ br label %if.end, !dbg !34
+
+ if.end: ; preds = %if.then, %entry
+ ret i32 0, !dbg !35
+ }
+
+ declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+ declare void @g([4 x double]) local_unnamed_addr #2
+ declare void @h(i32*) local_unnamed_addr #2
+ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+ declare void @llvm.stackprotector(i8*, i8**) #3
+
+ attributes #0 = { nounwind optsize ssp }
+ attributes #1 = { nounwind readnone speculatable }
+ attributes #2 = { optsize }
+ attributes #3 = { nounwind }
+ attributes #4 = { nounwind optsize }
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!3, !4, !5}
+ !llvm.ident = !{!6}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 302682) (llvm/trunk 302683)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+ !1 = !DIFile(filename: "/tmp/clobber.c", directory: "/Volumes/Data/apple-internal/swift")
+ !2 = !{}
+ !3 = !{i32 2, !"Dwarf Version", i32 2}
+ !4 = !{i32 2, !"Debug Info Version", i32 3}
+ !5 = !{i32 1, !"PIC Level", i32 2}
+ !6 = !{!"clang version 5.0.0 (trunk 302682) (llvm/trunk 302683)"}
+ !7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 7, type: !8, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !18)
+ !8 = !DISubroutineType(types: !9)
+ !9 = !{!10, !10, !10, !11}
+ !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+ !11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Rect", file: !1, line: 1, size: 256, elements: !12)
+ !12 = !{!13, !15, !16, !17}
+ !13 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !11, file: !1, line: 2, baseType: !14, size: 64)
+ !14 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+ !15 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !11, file: !1, line: 2, baseType: !14, size: 64, offset: 64)
+ !16 = !DIDerivedType(tag: DW_TAG_member, name: "w", scope: !11, file: !1, line: 2, baseType: !14, size: 64, offset: 128)
+ !17 = !DIDerivedType(tag: DW_TAG_member, name: "h", scope: !11, file: !1, line: 2, baseType: !14, size: 64, offset: 192)
+ !18 = !{!19, !20, !21}
+ !19 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 7, type: !10)
+ !20 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 7, type: !10)
+ !21 = !DILocalVariable(name: "s", arg: 3, scope: !7, file: !1, line: 7, type: !11)
+ !22 = !DIExpression()
+ !23 = !DILocation(line: 7, column: 11, scope: !7)
+ !24 = !{!25, !25, i64 0}
+ !25 = !{!"int", !26, i64 0}
+ !26 = !{!"omnipotent char", !27, i64 0}
+ !27 = !{!"Simple C/C++ TBAA"}
+ !28 = !DILocation(line: 7, column: 18, scope: !7)
+ !29 = !DILocation(line: 7, column: 33, scope: !7)
+ !30 = !DILocation(line: 8, column: 3, scope: !7)
+ !31 = !DILocation(line: 9, column: 7, scope: !32)
+ !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 7)
+ !33 = !DILocation(line: 9, column: 7, scope: !7)
+ !34 = !DILocation(line: 10, column: 5, scope: !32)
+ !35 = !DILocation(line: 12, column: 3, scope: !7)
+
+...
+---
+name: f
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%w0' }
+ - { reg: '%w1' }
+ - { reg: '%d0' }
+ - { reg: '%d1' }
+ - { reg: '%d2' }
+ - { reg: '%d3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 32
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: true
+ hasCalls: true
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+stack:
+ - { id: 0, name: x.addr, offset: -20, size: 4, alignment: 4, local-offset: -4 }
+ - { id: 1, type: spill-slot, offset: -24, size: 4, alignment: 4 }
+ - { id: 2, type: spill-slot, offset: -8, size: 8, alignment: 8, callee-saved-register: '%lr' }
+ - { id: 3, type: spill-slot, offset: -16, size: 8, alignment: 8, callee-saved-register: '%fp' }
+body: |
+ bb.0.entry:
+ successors: %bb.2.if.end(0x40000000), %bb.1.if.then(0x40000000)
+ liveins: %w0, %w1, %d0, %d1, %d2, %d3, %lr
+
+ %sp = frame-setup SUBXri %sp, 32, 0
+ frame-setup STPXi killed %fp, killed %lr, %sp, 2 :: (store 8 into %stack.3), (store 8 into %stack.2)
+ %fp = frame-setup ADDXri %sp, 16, 0
+ DBG_VALUE debug-use %w0, debug-use _, !19, !22, debug-location !23
+ STURWi killed %w0, %fp, -4 :: (store 4 into %stack.0.x.addr)
+ DBG_VALUE debug-use %w1, debug-use _, !20, !22, debug-location !28
+ STRWui killed %w1, %sp, 2, debug-location !30 :: (store 4 into %stack.1)
+ DBG_VALUE %sp, 8, !20, !22, debug-location !28
+ BL @g, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit killed %d0, implicit killed %d1, implicit killed %d2, implicit killed %d3, implicit-def %sp, debug-location !30
+ %w0 = LDRWui %sp, 2, debug-location !33 :: (load 4 from %stack.1)
+ CBZW killed %w0, %bb.2.if.end, debug-location !33
+
+ bb.1.if.then:
+ successors: %bb.2.if.end(0x80000000)
+
+ DBG_VALUE debug-use %sp, 8, !20, !22, debug-location !28
+ %x0 = SUBXri %fp, 4, 0
+ DBG_VALUE debug-use %x0, debug-use _, !19, !22, debug-location !23
+ BL @h, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit killed %x0, debug-location !34
+
+ bb.2.if.end:
+ DBG_VALUE debug-use %sp, 8, !20, !22, debug-location !28
+ %w8 = MOVZWi 0, 0
+ %x0 = ORRXrs %xzr, undef %x8, 0, implicit killed %w8, debug-location !35
+ %fp, %lr = LDPXi %sp, 2, debug-location !35 :: (load 8 from %stack.3), (load 8 from %stack.2)
+ %sp = ADDXri %sp, 32, 0, debug-location !35
+ RET undef %lr, implicit killed %w0, debug-location !35
+
+...
diff --git a/test/DebugInfo/MIR/AArch64/lit.local.cfg b/test/DebugInfo/MIR/AArch64/lit.local.cfg
new file mode 100644
index 000000000000..cec29af5bbe4
--- /dev/null
+++ b/test/DebugInfo/MIR/AArch64/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'AArch64' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/DebugInfo/PDB/Inputs/simple-line-info.yaml b/test/DebugInfo/PDB/Inputs/simple-line-info.yaml
index 66030020f8f4..d1324d26d8bb 100644
--- a/test/DebugInfo/PDB/Inputs/simple-line-info.yaml
+++ b/test/DebugInfo/PDB/Inputs/simple-line-info.yaml
@@ -5,39 +5,40 @@ DbiStream:
ObjFile: 'd:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj'
SourceFiles:
- 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
- LineInfo:
- Checksums:
- - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
- Kind: MD5
- Checksum: A0A5BD0D3ECD93FC29D19DE826FBF4BC
- - FileName: 'f:\dd\externalapis\windows\10\sdk\inc\winerror.h'
- Kind: MD5
- Checksum: 1154D69F5B2650196E1FC34F4134E56B
- Lines:
- - CodeSize: 10
- Flags: [ ]
- RelocOffset: 16
- RelocSegment: 1
- Blocks:
- - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
- Lines:
- - Offset: 0
- LineStart: 5
- IsStatement: true
- EndDelta: 0
- - Offset: 3
- LineStart: 6
- IsStatement: true
- EndDelta: 0
- - Offset: 8
- LineStart: 7
- IsStatement: true
- EndDelta: 0
- Columns:
- InlineeLines:
- - HasExtraFiles: false
- Sites:
- - FileName: 'f:\dd\externalapis\windows\10\sdk\inc\winerror.h'
- LineNum: 26950
- Inlinee: 22767
+ Subsections:
+ - !FileChecksums
+ Checksums:
+ - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
+ Kind: MD5
+ Checksum: A0A5BD0D3ECD93FC29D19DE826FBF4BC
+ - FileName: 'f:\dd\externalapis\windows\10\sdk\inc\winerror.h'
+ Kind: MD5
+ Checksum: 1154D69F5B2650196E1FC34F4134E56B
+ - !Lines
+ CodeSize: 10
+ Flags: [ ]
+ RelocOffset: 16
+ RelocSegment: 1
+ Blocks:
+ - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
+ Lines:
+ - Offset: 0
+ LineStart: 5
+ IsStatement: true
+ EndDelta: 0
+ - Offset: 3
+ LineStart: 6
+ IsStatement: true
+ EndDelta: 0
+ - Offset: 8
+ LineStart: 7
+ IsStatement: true
+ EndDelta: 0
+ Columns:
+ - !InlineeLines
+ HasExtraFiles: false
+ Sites:
+ - FileName: 'f:\dd\externalapis\windows\10\sdk\inc\winerror.h'
+ LineNum: 26950
+ Inlinee: 22767
...
diff --git a/test/DebugInfo/PDB/pdbdump-write.test b/test/DebugInfo/PDB/pdbdump-write.test
index f56b4fbe3624..393473a53af1 100644
--- a/test/DebugInfo/PDB/pdbdump-write.test
+++ b/test/DebugInfo/PDB/pdbdump-write.test
@@ -11,10 +11,10 @@
; (for example if we don't write the entire stream)
;
; RUN: llvm-pdbdump pdb2yaml -stream-metadata -stream-directory \
-; RUN: -pdb-stream -tpi-stream %p/Inputs/empty.pdb > %t.1
+; RUN: -pdb-stream -tpi-stream -dbi-module-syms %p/Inputs/empty.pdb > %t.1
; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2 %t.1
; RUN: llvm-pdbdump pdb2yaml -pdb-stream -tpi-stream \
-; RUN: -no-file-headers %p/Inputs/empty.pdb > %t.3
+; RUN: -dbi-module-syms -no-file-headers %p/Inputs/empty.pdb > %t.3
; RUN: llvm-pdbdump pdb2yaml -pdb-stream -tpi-stream \
-; RUN: -no-file-headers %t.2 > %t.4
+; RUN: -dbi-module-syms -no-file-headers %t.2 > %t.4
; RUN: diff %t.3 %t.4
diff --git a/test/DebugInfo/PDB/pdbdump-yaml-lineinfo.test b/test/DebugInfo/PDB/pdbdump-yaml-lineinfo.test
index ca7427c0099b..f959805c7474 100644
--- a/test/DebugInfo/PDB/pdbdump-yaml-lineinfo.test
+++ b/test/DebugInfo/PDB/pdbdump-yaml-lineinfo.test
@@ -28,12 +28,8 @@ YAML: - Module: 'd:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj'
YAML: ObjFile: 'd:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj'
YAML: SourceFiles:
YAML: - 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
-YAML: LineInfo:
-YAML: Checksums:
-YAML: - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
-YAML: Kind: MD5
-YAML: Checksum: A0A5BD0D3ECD93FC29D19DE826FBF4BC
-YAML: Lines:
+YAML: Subsections:
+YAML: - !Lines
YAML: CodeSize: 10
YAML: Flags: [ ]
YAML: RelocOffset: 16
@@ -54,6 +50,11 @@ YAML: LineStart: 7
YAML: IsStatement: true
YAML: EndDelta: 0
YAML: Columns:
+YAML: - !FileChecksums
+YAML: Checksums:
+YAML: - FileName: 'd:\src\llvm\test\debuginfo\pdb\inputs\empty.cpp'
+YAML: Kind: MD5
+YAML: Checksum: A0A5BD0D3ECD93FC29D19DE826FBF4BC
YAML: - Module: '* Linker *'
YAML: ObjFile: ''
YAML: ... \ No newline at end of file
diff --git a/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll b/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll
new file mode 100644
index 000000000000..4df6ffeb5a8c
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll
@@ -0,0 +1,13 @@
+; Test -sanitizer-coverage-inline-8bit-counters=1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+define void @foo() {
+entry:
+; CHECK: %0 = load i8, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__sancov_gen_, i64 0, i64 0), !nosanitize
+; CHECK: %1 = add i8 %0, 1
+; CHECK: store i8 %1, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__sancov_gen_, i64 0, i64 0), !nosanitize
+ ret void
+}
+; CHECK: call void @__sanitizer_cov_8bit_counters_init(i8* bitcast (i8** @__start___sancov_counters to i8*), i8* bitcast (i8** @__stop___sancov_counters to i8*))
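
Reading the checks together (a summary of what they assert, not additional behavior): every instrumented block owns one byte of a counters section and bumps it with an ordinary load/add/store marked !nosanitize, and a module constructor hands the runtime the whole section through the linker-provided __start/__stop symbols.

    ; per-block pattern asserted above, for a block whose byte is @__sancov_gen_:
    ;   %0 = load i8, i8* <counter byte>
    ;   %1 = add i8 %0, 1          ; plain, unsynchronized 8-bit increment
    ;   store i8 %1, i8* <counter byte>
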
diff --git a/test/MC/WebAssembly/external-data.ll b/test/MC/WebAssembly/external-data.ll
new file mode 100644
index 000000000000..91e05b3f13a6
--- /dev/null
+++ b/test/MC/WebAssembly/external-data.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple wasm32-unknown-unknown-wasm -filetype=obj %s -o - | obj2yaml | FileCheck %s
+; Verify relocations are correctly generated for addresses of externals
+; in the data section.
+
+declare i32 @f1(...)
+
+@foo = global i64 7, align 4
+@far = local_unnamed_addr global i32 (...)* @f1, align 4
+
+; CHECK: - Type: DATA
+; CHECK: Relocations:
+; CHECK: - Type: R_WEBASSEMBLY_GLOBAL_ADDR_I32
+; CHECK: Index: 0
+; CHECK: Offset: 0x0000000E
+; CHECK: Segments:
+; CHECK: - Index: 0
+; CHECK: Offset:
+; CHECK: Opcode: I32_CONST
+; CHECK: Value: 0
+; CHECK: Content: 0700000000000000FFFFFFFF
+
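Decoding the Content bytes under the natural layout (my reading of the checks; the one segment packs both globals):

    07 00 00 00 00 00 00 00   ; @foo = i64 7, little-endian
    FF FF FF FF               ; @far's pointer to @f1: placeholder bytes that the
                              ; R_WEBASSEMBLY_GLOBAL_ADDR_I32 relocation at 0x0E patches
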
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index 0c85322eb565..c19ccb01be3c 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -22,6 +22,20 @@
; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=CHECK2
; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
+; RUN: llvm-bcanalyzer -dump %t.out.index.bc | FileCheck %s --check-prefix=COMBINED
+; Live, NotEligibleForImport, Internal
+; COMBINED-DAG: <COMBINED {{.*}} op2=55
+; Live, Internal
+; COMBINED-DAG: <COMBINED {{.*}} op2=39
+; Live, External
+; COMBINED-DAG: <COMBINED {{.*}} op2=32
+; COMBINED-DAG: <COMBINED {{.*}} op2=32
+; COMBINED-DAG: <COMBINED {{.*}} op2=32
+; (Dead)
+; COMBINED-DAG: <COMBINED {{.*}} op2=0
+; COMBINED-DAG: <COMBINED {{.*}} op2=0
+; COMBINED-DAG: <COMBINED {{.*}} op2=0
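+; A decoding sketch for the op2 values above, assuming the flags word packs
+; linkage in the low four bits, NotEligibleForImport at bit 4 and Live at
+; bit 5: 55 = 32 + 16 + 7 (Live, NotEligible, Internal), 39 = 32 + 7
+; (Live, Internal), 32 = Live with External linkage, and 0 is a dead
+; external symbol.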
+
; Dead-stripping on the index allows us to internalize these,
; and limit the import of @baz thanks to early pruning.
; CHECK-NOT: available_externally {{.*}} @baz()
@@ -35,7 +49,7 @@
; Make sure we didn't internalize @boo, which is reachable via
; llvm.global_ctors
; CHECK2: define void @boo()
-; We should have eventually revoved @baz since it was internalized and unused
+; We should have eventually removed @baz since it was internalized and unused
; CHECK2-NM-NOT: _baz
; The final binary should not contain any of the dead functions,
diff --git a/test/ThinLTO/X86/newpm-basic.ll b/test/ThinLTO/X86/newpm-basic.ll
index d357cbc85d00..bfcc60c6807b 100644
--- a/test/ThinLTO/X86/newpm-basic.ll
+++ b/test/ThinLTO/X86/newpm-basic.ll
@@ -1,7 +1,7 @@
; RUN: opt -module-summary %s -o %t1.bc
; RUN: llvm-lto2 run %t1.bc -o %t.o \
; RUN: -r=%t1.bc,_tinkywinky,pxl \
-; RUN: -lto-use-new-pm
+; RUN: -use-new-pm
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/CodeExtractor/cost.ll b/test/Transforms/CodeExtractor/cost.ll
new file mode 100644
index 000000000000..4ac5acee019a
--- /dev/null
+++ b/test/Transforms/CodeExtractor/cost.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S < %s -partial-inliner -partial-inlining-extra-penalty=10 | FileCheck %s
+; RUN: opt -S < %s -passes=partial-inliner -partial-inlining-extra-penalty=10 | FileCheck %s
+define i32 @outline_region_notlikely(i32* %arg) local_unnamed_addr {
+bb:
+; ptr != null is predicted to be true
+ %tmp = icmp ne i32* %arg, null
+ br i1 %tmp, label %bb8, label %bb1
+
+; bb1 is not likely
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp3 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp4 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp5 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp6 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp7 = tail call i32 @foo(i32* nonnull %arg)
+ br label %bb8
+
+bb8: ; preds = %bb1, %bb
+ %tmp9 = phi i32 [ 0, %bb1 ], [ 1, %bb ]
+ ret i32 %tmp9
+}
+
+define i32 @outline_region_likely(i32* %arg) local_unnamed_addr {
+bb:
+; ptr == null is predicted to be false
+ %tmp = icmp eq i32* %arg, null
+ br i1 %tmp, label %bb8, label %bb1
+
+; bb1 is likely
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp3 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp4 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp5 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp6 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp7 = tail call i32 @foo(i32* nonnull %arg)
+ br label %bb8
+
+bb8: ; preds = %bb1, %bb
+ %tmp9 = phi i32 [ 0, %bb1 ], [ 1, %bb ]
+ ret i32 %tmp9
+}
+
+declare i32 @foo(i32* %arg)
+
+define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
+; CHECK-LABEL: @dummy_caller
+ %tmp = call i32 @outline_region_notlikely(i32* %arg)
+; CHECK: call void @outline_region_notlikely.2_bb1
+ %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
+; CHECK: %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
+ ret i32 %tmp
+
+}
+
+; CHECK-LABEL: define internal void @outline_region_notlikely.2_bb1(i32* %arg) {
+; CHECK-NEXT: newFuncRoot:
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304489)"}
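
The shape the @dummy_caller checks assert, sketched with the names the checks themselves use (the exact IR the pass emits may differ in detail): partial inlining copies the cheap entry test into the caller and leaves the expensive cold block behind as an outlined function, so the common path costs only one compare plus a rarely taken call.

    ; roughly what @dummy_caller contains after the pass:
    ;   %tmp.i = icmp ne i32* %arg, null
    ;   br i1 %tmp.i, label %fastpath, label %coldpath
    ; coldpath:                                            ; unlikely
    ;   call void @outline_region_notlikely.2_bb1(i32* %arg)
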
diff --git a/test/Transforms/CodeExtractor/cost_meta.ll b/test/Transforms/CodeExtractor/cost_meta.ll
new file mode 100644
index 000000000000..2e4467a8d0c9
--- /dev/null
+++ b/test/Transforms/CodeExtractor/cost_meta.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S < %s -partial-inliner -partial-inlining-extra-penalty=2000 | FileCheck %s
+; RUN: opt -S < %s -passes=partial-inliner -partial-inlining-extra-penalty=2000 | FileCheck %s
+define i32 @outline_region_notlikely(i32* %arg) local_unnamed_addr {
+bb:
+; ptr != null is predicted to be true
+ %tmp = icmp ne i32* %arg, null
+ br i1 %tmp, label %bb8, label %bb1, !prof !2
+
+; bb1 is not likely
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp3 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp4 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp5 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp6 = tail call i32 @foo(i32* nonnull %arg)
+ %tmp7 = tail call i32 @foo(i32* nonnull %arg)
+ br label %bb8
+
+bb8: ; preds = %bb1, %bb
+ %tmp9 = phi i32 [ 0, %bb1 ], [ 1, %bb ]
+ ret i32 %tmp9
+}
+
+define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
+; CHECK-LABEL: @dummy_caller
+ %tmp = call i32 @outline_region_notlikely(i32* %arg)
+ ret i32 %tmp
+ }
+
+
+; CHECK-LABEL: define internal void @outline_region_notlikely.1_bb1(i32* %arg) {
+; CHECK-NEXT: newFuncRoot:
+
+declare i32 @foo(i32 * %arg)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 304489)"}
+!2 = !{!"branch_weights", i32 2000, i32 1}
diff --git a/test/Transforms/Coroutines/coro-split-02.ll b/test/Transforms/Coroutines/coro-split-02.ll
index 953c25088652..4dc8921cd69a 100644
--- a/test/Transforms/Coroutines/coro-split-02.ll
+++ b/test/Transforms/Coroutines/coro-split-02.ll
@@ -1,5 +1,6 @@
; Tests that coro-split can handle the case when code after coro.suspend uses
; a value produced between coro.save and coro.suspend (%Result.i19)
+; and checks whether stray coro.saves are properly removed
; RUN: opt < %s -coro-split -S | FileCheck %s
%"struct.std::coroutine_handle" = type { i8* }
@@ -24,9 +25,10 @@ entry:
i8 1, label %exit
]
await.ready:
+ %StrayCoroSave = call token @llvm.coro.save(i8* null)
%val = load i32, i32* %Result.i19
call void @print(i32 %val)
- br label %exit
+ br label %exit
exit:
call i1 @llvm.coro.end(i8* null, i1 false)
ret void
@@ -35,6 +37,7 @@ exit:
; CHECK-LABEL: @a.resume(
; CHECK: getelementptr inbounds %a.Frame
; CHECK-NEXT: getelementptr inbounds %"struct.lean_future<int>::Awaiter"
+; CHECK-NOT: call token @llvm.coro.save(i8* null)
; CHECK-NEXT: %val = load i32, i32* %Result
; CHECK-NEXT: call void @print(i32 %val)
; CHECK-NEXT: ret void
diff --git a/test/Transforms/Inline/AArch64/switch.ll b/test/Transforms/Inline/AArch64/switch.ll
index 96d6bf2db682..a530ba734705 100644
--- a/test/Transforms/Inline/AArch64/switch.ll
+++ b/test/Transforms/Inline/AArch64/switch.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
-; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
+; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
define i32 @callee_range(i32 %a, i32* %P) {
switch i32 %a, label %sw.default [
diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll
index 6ff0a50318d2..8352c07a816b 100644
--- a/test/Transforms/InstCombine/not.ll
+++ b/test/Transforms/InstCombine/not.ll
@@ -33,17 +33,46 @@ define i1 @invert_fcmp(float %X, float %Y) {
; PR2298
-define zeroext i8 @test6(i32 %a, i32 %b) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 %b, %a
-; CHECK-NEXT: [[RETVAL67:%.*]] = zext i1 [[TMP3]] to i8
-; CHECK-NEXT: ret i8 [[RETVAL67]]
+define i1 @not_not_cmp(i32 %a, i32 %b) {
+; CHECK-LABEL: @not_not_cmp(
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 %b, %a
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %cmp = icmp slt i32 %nota, %notb
+ ret i1 %cmp
+}
+
+define <2 x i1> @not_not_cmp_vector(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @not_not_cmp_vector(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i32> %b, %a
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+ %nota = xor <2 x i32> %a, <i32 -1, i32 -1>
+ %notb = xor <2 x i32> %b, <i32 -1, i32 -1>
+ %cmp = icmp ugt <2 x i32> %nota, %notb
+ ret <2 x i1> %cmp
+}
+
+define i1 @not_cmp_constant(i32 %a) {
+; CHECK-LABEL: @not_cmp_constant(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 %a, -43
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %nota = xor i32 %a, -1
+ %cmp = icmp ugt i32 %nota, 42
+ ret i1 %cmp
+}
+
+define <2 x i1> @not_cmp_constant_vector(<2 x i32> %a) {
+; CHECK-LABEL: @not_cmp_constant_vector(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> %a, <i32 -43, i32 -43>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
;
- %tmp1not = xor i32 %a, -1
- %tmp2not = xor i32 %b, -1
- %tmp3 = icmp slt i32 %tmp1not, %tmp2not
- %retval67 = zext i1 %tmp3 to i8
- ret i8 %retval67
+ %nota = xor <2 x i32> %a, <i32 -1, i32 -1>
+ %cmp = icmp slt <2 x i32> %nota, <i32 42, i32 42>
+ ret <2 x i1> %cmp
}
define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) {
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index d6f1b634102f..20ebd36991a5 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -1278,3 +1278,19 @@ define void @icmp_slt_sge_or(i32 %Ax, i32 %Bx) {
; CHECK: call void @helper_i1(i1 true)
ret void
}
+
+define i1 @constant_fold_inttoptr_null() {
+; CHECK-LABEL: @constant_fold_inttoptr_null(
+; CHECK-NEXT: ret i1 false
+;
+ %x = icmp eq i32* inttoptr (i64 32 to i32*), null
+ ret i1 %x
+}
+
+define i1 @constant_fold_null_inttoptr() {
+; CHECK-LABEL: @constant_fold_null_inttoptr(
+; CHECK-NEXT: ret i1 false
+;
+ %x = icmp eq i32* null, inttoptr (i64 32 to i32*)
+ ret i1 %x
+}
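
Both directions fold for the same reason (assuming the usual model in which null is address zero): the inttoptr operand is the constant address 32, so the equality against null reduces to integer arithmetic the constant folder can settle.

    icmp eq (inttoptr 32 to i32*), null  -->  32 == 0  -->  false
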
diff --git a/test/Transforms/LowerExpectIntrinsic/phi_merge.ll b/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
new file mode 100644
index 000000000000..3b407c0f3a5a
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
@@ -0,0 +1,356 @@
+; RUN: opt -lower-expect -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s
+
+; The C case
+; if (__builtin_expect((x > goo() && y > hoo() && z > too()), 1))
+; For the above case, all 3 branches should be annotated.
+;
+; if (__builtin_expect((x > goo() && y > hoo() && z > too()), 0))
+; For the above case, we don't have enough information, so
+; only the last branch is annotated.
+
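+; In rough terms, what lower-expect does with the phi here: the value handed
+; to @llvm.expect.i64 in @foo comes from
+;   %tmp19 = phi i1 [ false, %bb10 ], [ false, %bb ], [ %tmp17, %bb14 ]
+; and the expected constant is pushed back through the incoming values.
+; Expecting 1 (true) rules out the two 'false' edges, which also pins down
+; the likely direction of the branches in %bb and %bb10; expecting 0 (false)
+; matches two different incoming edges, so those earlier branches stay
+; ambiguous and only the branch on the expect result itself gets weights.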
+define void @foo(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: !prof [[WEIGHT:![0-9]+]]
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13, {{.*}}!prof [[WEIGHT]]
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i1 [ false, %bb10 ], [ false, %bb ], [ %tmp17, %bb14 ]
+ %tmp20 = xor i1 %tmp19, true
+ %tmp21 = xor i1 %tmp20, true
+ %tmp22 = zext i1 %tmp21 to i32
+ %tmp23 = sext i32 %tmp22 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 1)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+define void @foo2(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo2
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9
+; CHECK-NOT: !prof
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13
+; CHECK-NOT: !prof
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i1 [ false, %bb10 ], [ false, %bb ], [ %tmp17, %bb14 ]
+ %tmp20 = xor i1 %tmp19, true
+ %tmp21 = xor i1 %tmp20, true
+ %tmp22 = zext i1 %tmp21 to i32
+ %tmp23 = sext i32 %tmp22 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 0)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT2:![0-9]+]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+define void @foo_i32(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i32
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: !prof [[WEIGHT]]
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13, {{.*}}!prof [[WEIGHT]]
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i32 [ 5, %bb10 ], [ 5, %bb ], [ %tmp16, %bb14 ]
+ %tmp23 = sext i32 %tmp19 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 4)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+
+define void @foo_i32_not_unlikely(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i32_not_unlikely
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9
+; CHECK-NOT: !prof
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13
+; CHECK-NOT: !prof
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i32 [ 4, %bb10 ], [ 4, %bb ], [ %tmp16, %bb14 ]
+ %tmp23 = sext i32 %tmp19 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 4)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+define void @foo_i32_xor(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i32_xor
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9,{{.*}}!prof [[WEIGHT]]
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13,{{.*}}!prof [[WEIGHT]]
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i32 [ 6, %bb10 ], [ 6, %bb ], [ %tmp16, %bb14 ]
+ %tmp20 = xor i32 %tmp19, 3
+ %tmp23 = sext i32 %tmp20 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 4)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+define void @foo_i8_sext(i32 %arg, i32 %arg1, i8 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i8_sext
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9,{{.*}}!prof [[WEIGHT]]
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13,{{.*}}!prof [[WEIGHT]]
+
+bb14: ; preds = %bb10
+ %tmp16 = call i8 @too8()
+ %tmp17 = icmp sgt i8 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i8 [ 255, %bb10 ], [ 255, %bb ], [ %tmp16, %bb14 ]
+ %tmp23 = sext i8 %tmp19 to i64
+; after sign extension, the operand value becomes -1, which does not match 255
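+; (the i8 bit pattern 0xFF sign-extends to i64 0xFFFFFFFFFFFFFFFF, i.e. -1,
+; so the constant phi edges are treated as mismatched and marked unlikely)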
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 255)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+define void @foo_i8_sext_not_unlikely(i32 %arg, i32 %arg1, i8 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i8_sext_not_unlikely
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9
+; CHECK-NOT: !prof
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13
+; CHECK-NOT: !prof
+
+bb14: ; preds = %bb10
+ %tmp16 = call i8 @too8()
+ %tmp17 = icmp sgt i8 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i8 [ 255, %bb10 ], [ 255, %bb ], [ %tmp16, %bb14 ]
+ %tmp23 = sext i8 %tmp19 to i64
+; after sign extension, the operand value becomes -1, which matches the expected -1
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 -1)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+
+define void @foo_i32_xor_not_unlikely(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo_i32_xor_not_unlikely
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp sgt i32 %tmp8, %arg
+ br i1 %tmp9, label %bb10, label %bb18
+; CHECK: br i1 %tmp9
+; CHECK-NOT: !prof
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br i1 %tmp13, label %bb14, label %bb18
+; CHECK: br i1 %tmp13
+; CHECK-NOT: !prof
+
+bb14: ; preds = %bb10
+ %tmp16 = call i32 @too()
+ %tmp17 = icmp sgt i32 %arg2, %tmp16
+ br label %bb18
+
+bb18: ; preds = %bb14, %bb10, %bb
+ %tmp19 = phi i32 [ 6, %bb10 ], [ 6, %bb ], [ %tmp16, %bb14 ]
+ %tmp20 = xor i32 %tmp19, 2
+ %tmp23 = sext i32 %tmp20 to i64
+ %tmp24 = call i64 @llvm.expect.i64(i64 %tmp23, i64 4)
+ %tmp25 = icmp ne i64 %tmp24, 0
+ br i1 %tmp25, label %bb26, label %bb28
+; CHECK: br i1 %tmp25,{{.*}}!prof [[WEIGHT]]
+
+bb26: ; preds = %bb18
+ %tmp27 = call i32 @goo()
+ br label %bb30
+
+bb28: ; preds = %bb18
+ %tmp29 = call i32 @hoo()
+ br label %bb30
+
+bb30: ; preds = %bb28, %bb26
+ ret void
+}
+
+declare i32 @goo()
+
+declare i32 @hoo()
+
+declare i32 @too()
+
+declare i8 @too8()
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.expect.i64(i64, i64)
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 302965)"}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
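For context on the tests above: clang lowers __builtin_expect over a short-circuited condition into a phi (often behind an xor/zext/sext chain) feeding @llvm.expect.i64, and the pass has to trace back through the phi to annotate the originating branches. A minimal C sketch of that source pattern, with goo/hoo/too as illustrative stand-ins for the declared externals and no claim about the exact IR a given clang version emits:

    #include <stdio.h>

    static int goo(void) { return 1; }
    static int hoo(void) { return 2; }
    static int too(void) { return 3; }

    void foo(int x, int y, int z) {
      /* A short-circuited && reaches @llvm.expect.i64 as a phi of i1
         values; expecting 0 puts the heavy weight on the false
         successor, matching the WEIGHT2 = {1, 2000} metadata above. */
      if (__builtin_expect(x > goo() && y > hoo() && z > too(), 0))
        printf("unlikely path\n");
      else
        printf("likely path\n");
    }

    int main(void) { foo(0, 0, 0); return 0; }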
diff --git a/test/Transforms/LowerExpectIntrinsic/phi_or.ll b/test/Transforms/LowerExpectIntrinsic/phi_or.ll
new file mode 100644
index 000000000000..849baef3dca8
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/phi_or.ll
@@ -0,0 +1,103 @@
+; RUN: opt -lower-expect -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s
+;
+; if (__builtin_expect((x > goo() || y > hoo()), 1)) {
+; ..
+; }
+; For the above case, only the second branch should be
+; annotated.
+; if (__builtin_expect((x > goo() || y > hoo()), 0)) {
+; ..
+; }
+; For the above case, two branches should be annotated.
+; Function Attrs: noinline nounwind uwtable
+define void @foo(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo
+bb:
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp slt i32 %arg, %tmp8
+ br i1 %tmp9, label %bb14, label %bb10
+; CHECK: br i1 %tmp9
+; CHECK-NOT: br i1 %tmp9{{.*}}!prof
+
+bb10: ; preds = %bb
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %arg1, %tmp12
+ br label %bb14
+
+bb14: ; preds = %bb10, %bb
+ %tmp15 = phi i1 [ true, %bb ], [ %tmp13, %bb10 ]
+ %tmp16 = zext i1 %tmp15 to i32
+ %tmp17 = sext i32 %tmp16 to i64
+ %expect = call i64 @llvm.expect.i64(i64 %tmp17, i64 1)
+ %tmp18 = icmp ne i64 %expect, 0
+ br i1 %tmp18, label %bb19, label %bb21
+; CHECK: br i1 %tmp18{{.*}}!prof [[WEIGHT:![0-9]+]]
+
+bb19: ; preds = %bb14
+ %tmp20 = call i32 @goo()
+ br label %bb23
+
+bb21: ; preds = %bb14
+ %tmp22 = call i32 @hoo()
+ br label %bb23
+
+bb23: ; preds = %bb21, %bb19
+ ret void
+}
+
+define void @foo2(i32 %arg, i32 %arg1, i32 %arg2, i32 %arg3) {
+; CHECK-LABEL: void @foo2
+bb:
+ %tmp = alloca i32, align 4
+ %tmp4 = alloca i32, align 4
+ %tmp5 = alloca i32, align 4
+ %tmp6 = alloca i32, align 4
+ store i32 %arg, i32* %tmp, align 4
+ store i32 %arg1, i32* %tmp4, align 4
+ store i32 %arg2, i32* %tmp5, align 4
+ store i32 %arg3, i32* %tmp6, align 4
+ %tmp7 = load i32, i32* %tmp, align 4
+ %tmp8 = call i32 @goo()
+ %tmp9 = icmp slt i32 %tmp7, %tmp8
+ br i1 %tmp9, label %bb14, label %bb10
+; CHECK: br i1 %tmp9{{.*}}!prof [[WEIGHT2:![0-9]+]]
+
+bb10: ; preds = %bb
+ %tmp11 = load i32, i32* %tmp5, align 4
+ %tmp12 = call i32 @hoo()
+ %tmp13 = icmp sgt i32 %tmp11, %tmp12
+ br label %bb14
+
+bb14: ; preds = %bb10, %bb
+ %tmp15 = phi i1 [ true, %bb ], [ %tmp13, %bb10 ]
+ %tmp16 = zext i1 %tmp15 to i32
+ %tmp17 = sext i32 %tmp16 to i64
+ %expect = call i64 @llvm.expect.i64(i64 %tmp17, i64 0)
+ %tmp18 = icmp ne i64 %expect, 0
+ br i1 %tmp18, label %bb19, label %bb21
+; CHECK: br i1 %tmp18{{.*}}!prof [[WEIGHT2]]
+
+bb19: ; preds = %bb14
+ %tmp20 = call i32 @goo()
+ br label %bb23
+
+bb21: ; preds = %bb14
+ %tmp22 = call i32 @hoo()
+ br label %bb23
+
+bb23: ; preds = %bb21, %bb19
+ ret void
+}
+
+declare i32 @goo()
+declare i32 @hoo()
+declare i64 @llvm.expect.i64(i64, i64)
+
+
+!llvm.ident = !{!0}
+
+
+!0 = !{!"clang version 5.0.0 (trunk 302965)"}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
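The header comment of phi_or.ll distinguishes the expected-true and expected-false || cases; a hedged C rendering of both (goo/hoo are stand-ins, and which branches receive !prof follows the WEIGHT/WEIGHT2 checks above):

    static int goo(void) { return 0; }
    static int hoo(void) { return 0; }

    int likely_or(int x, int y) {
      /* Expected true: the constant-true phi operand from the
         short-circuit edge agrees with the expectation, so only the
         branch on the expect result itself is annotated. */
      return __builtin_expect(x > goo() || y > hoo(), 1) ? 1 : 0;
    }

    int unlikely_or(int x, int y) {
      /* Expected false: the constant-true operand contradicts the
         expectation, so the short-circuit branch is annotated as
         unlikely in addition to the branch on the expect result. */
      return __builtin_expect(x > goo() || y > hoo(), 0) ? 1 : 0;
    }

    int main(void) { return likely_or(1, 1) + unlikely_or(0, 0); }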
diff --git a/test/Transforms/LowerExpectIntrinsic/phi_tern.ll b/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
new file mode 100644
index 000000000000..3c603d51b438
--- /dev/null
+++ b/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
@@ -0,0 +1,56 @@
+; RUN: opt -lower-expect -S -o - < %s | FileCheck %s
+; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s
+
+; return __builtin_expect((a > b ? 1 : goo()), 0);
+;
+; Function Attrs: noinline nounwind uwtable
+define i32 @foo(i32 %arg, i32 %arg1) {
+; CHECK-LABEL: i32 @foo
+bb:
+ %tmp5 = icmp sgt i32 %arg, %arg1
+ br i1 %tmp5, label %bb9, label %bb7
+; CHECK: br i1 %tmp5{{.*}}!prof [[WEIGHT:![0-9]+]]
+
+bb7: ; preds = %bb
+ %tmp8 = call i32 @goo()
+ br label %bb9
+
+bb9:                                              ; preds = %bb7, %bb
+ %tmp10 = phi i32 [ 1, %bb ], [ %tmp8, %bb7 ]
+ %tmp11 = sext i32 %tmp10 to i64
+ %expect = call i64 @llvm.expect.i64(i64 %tmp11, i64 0)
+ %tmp12 = trunc i64 %expect to i32
+ ret i32 %tmp12
+}
+
+define i32 @foo2(i32 %arg, i32 %arg1) {
+bb:
+ %tmp5 = icmp sgt i32 %arg, %arg1
+ br i1 %tmp5, label %bb6, label %bb7
+; CHECK: br i1 %tmp5{{.*}}!prof [[WEIGHT:![0-9]+]]
+
+bb6: ; preds = %bb
+ br label %bb9
+
+bb7: ; preds = %bb
+ %tmp8 = call i32 @goo()
+ br label %bb9
+
+bb9: ; preds = %bb7, %bb6
+ %tmp10 = phi i32 [ 1, %bb6 ], [ %tmp8, %bb7 ]
+ %tmp11 = sext i32 %tmp10 to i64
+ %expect = call i64 @llvm.expect.i64(i64 %tmp11, i64 0)
+ %tmp12 = trunc i64 %expect to i32
+ ret i32 %tmp12
+}
+
+declare i32 @goo()
+declare i64 @llvm.expect.i64(i64, i64)
+
+
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 302965)"}
+
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 1, i32 2000}
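The ternary in phi_tern.ll becomes a phi of the constant 1 and the call result; 1 cannot equal the expected 0, so the edge that produces it is marked unlikely, which is the {1, 2000} weight checked above. A small C sketch under the same assumptions (goo is a stand-in):

    static int goo(void) { return 0; }

    int foo(int a, int b) {
      /* a > b selects the constant 1, which cannot equal the expected
         0, so lower-expect marks the true edge as unlikely. */
      return __builtin_expect(a > b ? 1 : goo(), 0);
    }

    int main(void) { return foo(0, 1); }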
diff --git a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
index b7a1d208fc6f..cfac37986bda 100644
--- a/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
+++ b/test/Transforms/LowerTypeTests/Inputs/import-unsat.yaml
@@ -1,7 +1,8 @@
---
GlobalValueMap:
42:
- - TypeTests: [123]
+ - Live: true
+ TypeTests: [123]
TypeIdMap:
typeid1:
TTRes:
diff --git a/test/Transforms/LowerTypeTests/Inputs/use-typeid1-dead.yaml b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-dead.yaml
new file mode 100644
index 000000000000..7baa02ada86c
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-dead.yaml
@@ -0,0 +1,7 @@
+---
+GlobalValueMap:
+ 42:
+ - Live: false
+ TypeTests: [14276520915468743435] # guid("typeid1")
+WithGlobalValueDeadStripping: true
+...
diff --git a/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml
index 031b2e8de04e..f30257cfc0d4 100644
--- a/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml
+++ b/test/Transforms/LowerTypeTests/Inputs/use-typeid1-typeid2.yaml
@@ -1,5 +1,6 @@
---
GlobalValueMap:
42:
- - TypeTests: [14276520915468743435, 15427464259790519041] # guid("typeid1"), guid("typeid2")
+ - Live: true
+ TypeTests: [14276520915468743435, 15427464259790519041] # guid("typeid1"), guid("typeid2")
...
diff --git a/test/Transforms/LowerTypeTests/export-dead.ll b/test/Transforms/LowerTypeTests/export-dead.ll
new file mode 100644
index 000000000000..265402b34a6e
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/export-dead.ll
@@ -0,0 +1,14 @@
+; The only use of "typeid1" is in a dead function. Export nothing.
+
+; RUN: opt -S -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/use-typeid1-dead.yaml -lowertypetests-write-summary=%t < %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+@foo = constant i32 42, !type !0
+
+!0 = !{i32 0, !"typeid1"}
+
+; CHECK-NOT: @__typeid_typeid1_global_addr =
+
+; SUMMARY: TypeIdMap:
+; SUMMARY-NEXT: WithGlobalValueDeadStripping: true
+; SUMMARY-NEXT: ...
diff --git a/test/Transforms/LowerTypeTests/export-nothing.ll b/test/Transforms/LowerTypeTests/export-nothing.ll
index 9ab41b5f6cb6..8ad331539942 100644
--- a/test/Transforms/LowerTypeTests/export-nothing.ll
+++ b/test/Transforms/LowerTypeTests/export-nothing.ll
@@ -4,4 +4,5 @@
; CHECK: ---
; CHECK-NEXT: GlobalValueMap:
; CHECK-NEXT: TypeIdMap:
+; CHECK-NEXT: WithGlobalValueDeadStripping: false
; CHECK-NEXT: ...
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 76b244001986..6cb9b26fb574 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -4,7 +4,10 @@
; SUMMARY: GlobalValueMap:
; SUMMARY-NEXT: 42:
-; SUMMARY-NEXT: - TypeTests: [ 123 ]
+; SUMMARY-NEXT: - Linkage: 0
+; SUMMARY-NEXT: NotEligibleToImport: false
+; SUMMARY-NEXT: Live: true
+; SUMMARY-NEXT: TypeTests: [ 123 ]
; SUMMARY-NEXT: TypeIdMap:
; SUMMARY-NEXT: typeid1:
; SUMMARY-NEXT: TTRes:
diff --git a/test/Transforms/SROA/address-spaces.ll b/test/Transforms/SROA/address-spaces.ll
index 119f2252d95e..8fba30c2720f 100644
--- a/test/Transforms/SROA/address-spaces.ll
+++ b/test/Transforms/SROA/address-spaces.ll
@@ -83,3 +83,21 @@ define void @pr27557() {
store i32 addrspace(3)* @l, i32 addrspace(3)** %3, align 8
ret void
}
+
+; Make sure pre-splitting doesn't try to introduce an illegal bitcast
+define float @presplit(i64 addrspace(1)* %p) {
+entry:
+; CHECK-LABEL: @presplit(
+; CHECK: %[[CAST:.*]] = bitcast i64 addrspace(1)* {{.*}} to i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)* %[[CAST]]
+ %b = alloca i64
+ %b.cast = bitcast i64* %b to [2 x float]*
+ %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
+ %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
+ %l = load i64, i64 addrspace(1)* %p
+ store i64 %l, i64* %b
+ %f1 = load float, float* %b.gep1
+ %f2 = load float, float* %b.gep2
+ %ret = fadd float %f1, %f2
+ ret float %ret
+}
diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll
index 61f59f03e1bc..496bb8385217 100644
--- a/test/Transforms/Util/PredicateInfo/condprop.ll
+++ b/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s
-; RUN: opt -print-predicateinfo -analyze -reverse-iterate < %s 2>&1 | FileCheck %s
@a = external global i32 ; <i32*> [#uses=7]
diff --git a/test/Transforms/Util/PredicateInfo/condprop2.ll b/test/Transforms/Util/PredicateInfo/condprop2.ll
new file mode 100644
index 000000000000..415fa7c879e3
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/condprop2.ll
@@ -0,0 +1,474 @@
+; REQUIRES: asserts
+; NOTE: The flag -reverse-iterate is present only in a +Asserts build.
+; Hence, this test has been split from condprop.ll to test with -reverse-iterate.
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo -analyze -reverse-iterate < %s 2>&1 | FileCheck %s
+
+@a = external global i32 ; <i32*> [#uses=7]
+
+define i32 @test1() nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[TMP1]], label [[BB:%.*]], label [[BB1:%.*]]
+; CHECK: bb:
+; CHECK-NEXT: br label [[BB8:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 5
+; CHECK-NEXT: br i1 [[TMP3]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[BB8]]
+; CHECK: bb3:
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 4
+; CHECK-NEXT: br i1 [[TMP5]], label [[BB4:%.*]], label [[BB5:%.*]]
+; CHECK: bb4:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 5
+; CHECK-NEXT: br label [[BB8]]
+; CHECK: bb5:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 5
+; CHECK-NEXT: br i1 [[TMP9]], label [[BB6:%.*]], label [[BB7:%.*]]
+; CHECK: bb6:
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], 4
+; CHECK-NEXT: br label [[BB8]]
+; CHECK: bb7:
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: br label [[BB8]]
+; CHECK: bb8:
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP12]], [[BB7]] ], [ [[TMP11]], [[BB6]] ], [ [[TMP7]], [[BB4]] ], [ 4, [[BB2]] ], [ 5, [[BB]] ]
+; CHECK-NEXT: br label [[RETURN:%.*]]
+; CHECK: return:
+; CHECK-NEXT: ret i32 [[DOT0]]
+;
+entry:
+ %0 = load i32, i32* @a, align 4
+ %1 = icmp eq i32 %0, 4
+ br i1 %1, label %bb, label %bb1
+
+bb: ; preds = %entry
+ br label %bb8
+
+bb1: ; preds = %entry
+ %2 = load i32, i32* @a, align 4
+ %3 = icmp eq i32 %2, 5
+ br i1 %3, label %bb2, label %bb3
+
+bb2: ; preds = %bb1
+ br label %bb8
+
+bb3: ; preds = %bb1
+ %4 = load i32, i32* @a, align 4
+ %5 = icmp eq i32 %4, 4
+ br i1 %5, label %bb4, label %bb5
+
+bb4: ; preds = %bb3
+ %6 = load i32, i32* @a, align 4
+ %7 = add i32 %6, 5
+ br label %bb8
+
+bb5: ; preds = %bb3
+ %8 = load i32, i32* @a, align 4
+ %9 = icmp eq i32 %8, 5
+ br i1 %9, label %bb6, label %bb7
+
+bb6: ; preds = %bb5
+ %10 = load i32, i32* @a, align 4
+ %11 = add i32 %10, 4
+ br label %bb8
+
+bb7: ; preds = %bb5
+ %12 = load i32, i32* @a, align 4
+ br label %bb8
+
+bb8: ; preds = %bb7, %bb6, %bb4, %bb2, %bb
+ %.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ]
+ br label %return
+
+return: ; preds = %bb8
+ ret i32 %.0
+}
+
+declare void @foo(i1)
+declare void @bar(i32)
+
+define void @test3(i32 %x, i32 %y) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]]
+; CHECK: both_zero:
+; CHECK-NEXT: call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT: call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT: call void @bar(i32 [[X_0]])
+; CHECK-NEXT: call void @bar(i32 [[Y_0]])
+; CHECK-NEXT: ret void
+; CHECK: nope:
+; CHECK-NEXT: call void @foo(i1 [[Z_0]])
+; CHECK-NEXT: ret void
+;
+ %xz = icmp eq i32 %x, 0
+ %yz = icmp eq i32 %y, 0
+ %z = and i1 %xz, %yz
+ br i1 %z, label %both_zero, label %nope
+both_zero:
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ ret void
+nope:
+ call void @foo(i1 %z)
+ ret void
+}
+
+define void @test4(i1 %b, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
+; CHECK: sw:
+; CHECK: i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT: i32 2, label [[CASE0]]
+; CHECK-NEXT: i32 3, label [[CASE3]]
+; CHECK-NEXT: i32 4, label [[DEFAULT:%.*]]
+; CHECK-NEXT: ] Edge: [label [[SW]],label %case1] }
+; CHECK-NEXT: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]])
+; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT]] [
+; CHECK-NEXT: i32 0, label [[CASE0]]
+; CHECK-NEXT: i32 1, label [[CASE1]]
+; CHECK-NEXT: i32 2, label [[CASE0]]
+; CHECK-NEXT: i32 3, label [[CASE3]]
+; CHECK-NEXT: i32 4, label [[DEFAULT]]
+; CHECK-NEXT: ]
+; CHECK: default:
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: ret void
+; CHECK: case0:
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: ret void
+; CHECK: case1:
+; CHECK-NEXT: call void @bar(i32 [[X_0]])
+; CHECK-NEXT: ret void
+; CHECK: case3:
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: ret void
+;
+ br i1 %b, label %sw, label %case3
+sw:
+ switch i32 %x, label %default [
+ i32 0, label %case0
+ i32 1, label %case1
+ i32 2, label %case0
+ i32 3, label %case3
+ i32 4, label %default
+ ]
+default:
+ call void @bar(i32 %x)
+ ret void
+case0:
+ call void @bar(i32 %x)
+ ret void
+case1:
+ call void @bar(i32 %x)
+ ret void
+case3:
+ call void @bar(i32 %x)
+ ret void
+}
+
+define i1 @test5(i32 %x, i32 %y) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[X_1]], [[Y_1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp eq i32 %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ %cmp2 = icmp ne i32 %x, %y
+ ret i1 %cmp2
+
+different:
+ %cmp3 = icmp eq i32 %x, %y
+ ret i1 %cmp3
+}
+
+define i1 @test6(i32 %x, i32 %y) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp2 = icmp ne i32 %x, %y
+ %cmp = icmp eq i32 %x, %y
+ %cmp3 = icmp eq i32 %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ ret i1 %cmp2
+
+different:
+ ret i1 %cmp3
+}
+
+define i1 @test6_fp(float %x, float %y) {
+; CHECK-LABEL: @test6_fp(
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp une float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[X]], [[Y]]
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp oeq float [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp2 = fcmp une float %x, %y
+ %cmp = fcmp oeq float %x, %y
+ %cmp3 = fcmp oeq float %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ ret i1 %cmp2
+
+different:
+ ret i1 %cmp3
+}
+
+define i1 @test7(i32 %x, i32 %y) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[X_1]], [[Y_1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = icmp sgt i32 %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ %cmp2 = icmp sle i32 %x, %y
+ ret i1 %cmp2
+
+different:
+ %cmp3 = icmp sgt i32 %x, %y
+ ret i1 %cmp3
+}
+
+define i1 @test7_fp(float %x, float %y) {
+; CHECK-LABEL: @test7_fp(
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK: [[X_0:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK: [[X_1:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK: [[Y_0:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
+; CHECK: [[Y_1:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]]
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[X_1]], [[Y_1]]
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp = fcmp ogt float %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ %cmp2 = fcmp ule float %x, %y
+ ret i1 %cmp2
+
+different:
+ %cmp3 = fcmp ogt float %x, %y
+ ret i1 %cmp3
+}
+
+define i1 @test8(i32 %x, i32 %y) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]]
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp2 = icmp sle i32 %x, %y
+ %cmp = icmp sgt i32 %x, %y
+ %cmp3 = icmp sgt i32 %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ ret i1 %cmp2
+
+different:
+ ret i1 %cmp3
+}
+
+define i1 @test8_fp(float %x, float %y) {
+; CHECK-LABEL: @test8_fp(
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X]], [[Y]]
+; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[X]], [[Y]]
+; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK: same:
+; CHECK-NEXT: ret i1 [[CMP2]]
+; CHECK: different:
+; CHECK-NEXT: ret i1 [[CMP3]]
+;
+ %cmp2 = fcmp ule float %x, %y
+ %cmp = fcmp ogt float %x, %y
+ %cmp3 = fcmp ogt float %x, %y
+ br i1 %cmp, label %same, label %different
+
+same:
+ ret i1 %cmp2
+
+different:
+ ret i1 %cmp3
+}
+
+define i32 @test9(i32 %i, i32 %j) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK: cond_true:
+; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
+; CHECK-NEXT: ret i32 [[DIFF]]
+; CHECK: ret:
+; CHECK-NEXT: ret i32 5
+;
+ %cmp = icmp eq i32 %i, %j
+ br i1 %cmp, label %cond_true, label %ret
+
+cond_true:
+ %diff = sub i32 %i, %j
+ ret i32 %diff
+
+ret:
+ ret i32 5
+}
+
+define i32 @test10(i32 %j, i32 %i) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
+; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK: cond_true:
+; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
+; CHECK-NEXT: ret i32 [[DIFF]]
+; CHECK: ret:
+; CHECK-NEXT: ret i32 5
+;
+ %cmp = icmp eq i32 %i, %j
+ br i1 %cmp, label %cond_true, label %ret
+
+cond_true:
+ %diff = sub i32 %i, %j
+ ret i32 %diff
+
+ret:
+ ret i32 5
+}
+
+declare i32 @yogibar()
+
+define i32 @test11(i32 %x) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar()
+; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]]
+; CHECK: [[V0_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0]])
+; CHECK: [[V1_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V1]])
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]]
+; CHECK: cond_true:
+; CHECK-NEXT: ret i32 [[V1_0]]
+; CHECK: next:
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]]
+; CHECK: [[V0_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0_0]])
+; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]]
+; CHECK: cond_true2:
+; CHECK-NEXT: ret i32 [[V0_0_1]]
+; CHECK: next2:
+; CHECK-NEXT: ret i32 0
+;
+ %v0 = call i32 @yogibar()
+ %v1 = call i32 @yogibar()
+ %cmp = icmp eq i32 %v0, %v1
+ br i1 %cmp, label %cond_true, label %next
+
+cond_true:
+ ret i32 %v1
+
+next:
+ %cmp2 = icmp eq i32 %x, %v0
+ br i1 %cmp2, label %cond_true2, label %next2
+
+cond_true2:
+ ret i32 %v0
+
+next2:
+ ret i32 0
+}
+
+define i32 @test12(i32 %x) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK: cond_true:
+; CHECK-NEXT: br label [[RET:%.*]]
+; CHECK: cond_false:
+; CHECK-NEXT: br label [[RET]]
+; CHECK: ret:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0]], [[COND_TRUE]] ], [ [[X_1]], [[COND_FALSE]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %cond_true, label %cond_false
+
+cond_true:
+ br label %ret
+
+cond_false:
+ br label %ret
+
+ret:
+ %res = phi i32 [ %x, %cond_true ], [ %x, %cond_false ]
+ ret i32 %res
+}
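These autogenerated checks show PredicateInfo inserting llvm.ssa.copy calls for values constrained by a dominating branch. What that buys a consumer such as NewGVN is easiest to state at the C level; an illustrative source shape for @test5 (not part of the test itself):

    int test5(int x, int y) {
      if (x == y)
        return x != y;  /* dominated by x == y: foldable to 0 */
      return x == y;    /* dominated by x != y: foldable to 0 */
    }

    int main(void) { return test5(1, 2) + test5(3, 3); }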
diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll
index 43c508670908..c1048cf6d0f6 100644
--- a/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s
-; RUN: opt -print-predicateinfo -reverse-iterate < %s 2>&1 | FileCheck %s
declare void @foo(i1)
declare void @bar(i32)
diff --git a/test/Transforms/Util/PredicateInfo/testandor2.ll b/test/Transforms/Util/PredicateInfo/testandor2.ll
new file mode 100644
index 000000000000..a03250c2f7a0
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/testandor2.ll
@@ -0,0 +1,214 @@
+; REQUIRES: asserts
+; NOTE: The flag -reverse-iterate is present only in a +Asserts build.
+; Hence, this test has been split from testandor.ll to test with -reverse-iterate.
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo -reverse-iterate < %s 2>&1 | FileCheck %s
+
+declare void @foo(i1)
+declare void @bar(i32)
+declare void @llvm.assume(i1)
+
+define void @testor(i32 %x, i32 %y) {
+; CHECK-LABEL: @testor(
+; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
+; CHECK: oneof:
+; CHECK-NEXT: call void @foo(i1 [[XZ]])
+; CHECK-NEXT: call void @foo(i1 [[YZ]])
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: call void @bar(i32 [[Y]])
+; CHECK-NEXT: ret void
+; CHECK: neither:
+; CHECK-NEXT: call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT: call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT: call void @bar(i32 [[X_0]])
+; CHECK-NEXT: call void @bar(i32 [[Y_0]])
+; CHECK-NEXT: call void @foo(i1 [[Z_0]])
+; CHECK-NEXT: ret void
+;
+ %xz = icmp eq i32 %x, 0
+ %yz = icmp eq i32 %y, 0
+ %z = or i1 %xz, %yz
+ br i1 %z, label %oneof, label %neither
+oneof:
+;; Should not insert on the true edge for or
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ ret void
+neither:
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ call void @foo(i1 %z)
+ ret void
+}
+define void @testand(i32 %x, i32 %y) {
+; CHECK-LABEL: @testand(
+; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK: both:
+; CHECK-NEXT: call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT: call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT: call void @bar(i32 [[X_0]])
+; CHECK-NEXT: call void @bar(i32 [[Y_0]])
+; CHECK-NEXT: ret void
+; CHECK: nope:
+; CHECK-NEXT: call void @foo(i1 [[XZ]])
+; CHECK-NEXT: call void @foo(i1 [[YZ]])
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: call void @bar(i32 [[Y]])
+; CHECK-NEXT: call void @foo(i1 [[Z_0]])
+; CHECK-NEXT: ret void
+;
+ %xz = icmp eq i32 %x, 0
+ %yz = icmp eq i32 %y, 0
+ %z = and i1 %xz, %yz
+ br i1 %z, label %both, label %nope
+both:
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ ret void
+nope:
+;; Should not insert on the false edge for and
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ call void @foo(i1 %z)
+ ret void
+}
+define void @testandsame(i32 %x, i32 %y) {
+; CHECK-LABEL: @testandsame(
+; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0
+; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100
+; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]]
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]])
+; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]])
+; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK: both:
+; CHECK-NEXT: call void @foo(i1 [[XGT_0]])
+; CHECK-NEXT: call void @foo(i1 [[XLT_0]])
+; CHECK-NEXT: call void @bar(i32 [[X_0_1]])
+; CHECK-NEXT: ret void
+; CHECK: nope:
+; CHECK-NEXT: call void @foo(i1 [[XGT]])
+; CHECK-NEXT: call void @foo(i1 [[XLT]])
+; CHECK-NEXT: call void @foo(i1 [[Z_0]])
+; CHECK-NEXT: ret void
+;
+ %xgt = icmp sgt i32 %x, 0
+ %xlt = icmp slt i32 %x, 100
+ %z = and i1 %xgt, %xlt
+ br i1 %z, label %both, label %nope
+both:
+ call void @foo(i1 %xgt)
+ call void @foo(i1 %xlt)
+ call void @bar(i32 %x)
+ ret void
+nope:
+ call void @foo(i1 %xgt)
+ call void @foo(i1 %xlt)
+ call void @foo(i1 %z)
+ ret void
+}
+
+define void @testandassume(i32 %x, i32 %y) {
+; CHECK-LABEL: @testandassume(
+; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: call void @llvm.assume(i1 [[TMP5]])
+; CHECK: [[DOT0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP1]])
+; CHECK: [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]])
+; CHECK: [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]])
+; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP4]])
+; CHECK: [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]])
+; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK: both:
+; CHECK-NEXT: call void @foo(i1 [[DOT02]])
+; CHECK-NEXT: call void @foo(i1 [[DOT03]])
+; CHECK-NEXT: call void @bar(i32 [[DOT0]])
+; CHECK-NEXT: call void @bar(i32 [[DOT01]])
+; CHECK-NEXT: ret void
+; CHECK: nope:
+; CHECK-NEXT: call void @foo(i1 [[DOT04]])
+; CHECK-NEXT: ret void
+;
+ %xz = icmp eq i32 %x, 0
+ %yz = icmp eq i32 %y, 0
+ %z = and i1 %xz, %yz
+ call void @llvm.assume(i1 %z)
+ br i1 %z, label %both, label %nope
+both:
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ ret void
+nope:
+ call void @foo(i1 %z)
+ ret void
+}
+
+;; Unlike and/or for branches, an assume is *always* true, so we only match and for it
+define void @testorassume(i32 %x, i32 %y) {
+;
+; CHECK-LABEL: @testorassume(
+; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
+; CHECK-NEXT: call void @llvm.assume(i1 [[Z]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK: both:
+; CHECK-NEXT: call void @foo(i1 [[XZ]])
+; CHECK-NEXT: call void @foo(i1 [[YZ]])
+; CHECK-NEXT: call void @bar(i32 [[X]])
+; CHECK-NEXT: call void @bar(i32 [[Y]])
+; CHECK-NEXT: ret void
+; CHECK: nope:
+; CHECK-NEXT: call void @foo(i1 [[Z_0]])
+; CHECK-NEXT: ret void
+;
+ %xz = icmp eq i32 %x, 0
+ %yz = icmp eq i32 %y, 0
+ %z = or i1 %xz, %yz
+ call void @llvm.assume(i1 %z)
+ br i1 %z, label %both, label %nope
+both:
+ call void @foo(i1 %xz)
+ call void @foo(i1 %yz)
+ call void @bar(i32 %x)
+ call void @bar(i32 %y)
+ ret void
+nope:
+ call void @foo(i1 %z)
+ ret void
+}
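testorassume encodes the asymmetry stated in the comment above it: an assume guarantees its entire condition, so an and constrains both operands while an or constrains only its own result. A C sketch using clang's __builtin_assume (illustrative only):

    void and_case(int x, int y) {
      __builtin_assume(x == 0 && y == 0);
      /* Both x and y are now individually known to be 0. */
    }

    void or_case(int x, int y) {
      __builtin_assume(x == 0 || y == 0);
      /* Only the disjunction is known true; neither operand is pinned
         on its own, so PredicateInfo copies only the or result. */
    }

    int main(void) {
      and_case(0, 0);
      or_case(0, 5);
      return 0;
    }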
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/export.yaml b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml
index 0f6f59de7522..71cf38b216c7 100644
--- a/test/Transforms/WholeProgramDevirt/Inputs/export.yaml
+++ b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml
@@ -1,7 +1,8 @@
---
GlobalValueMap:
42:
- - TypeTestAssumeVCalls:
+ - Live: true
+ TypeTestAssumeVCalls:
- GUID: 14276520915468743435 # typeid1
Offset: 0
TypeCheckedLoadVCalls:
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml
index 1cb3ad3f134c..30159c5012b0 100644
--- a/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml
@@ -1,7 +1,8 @@
---
GlobalValueMap:
42:
- - TypeTestAssumeVCalls:
+ - Live: true
+ TypeTestAssumeVCalls:
- GUID: 123
Offset: 0
- GUID: 456
diff --git a/test/Transforms/WholeProgramDevirt/export-nothing.ll b/test/Transforms/WholeProgramDevirt/export-nothing.ll
index e0814efbf9c0..4707eaa17ead 100644
--- a/test/Transforms/WholeProgramDevirt/export-nothing.ll
+++ b/test/Transforms/WholeProgramDevirt/export-nothing.ll
@@ -4,4 +4,5 @@
; CHECK: ---
; CHECK-NEXT: GlobalValueMap:
; CHECK-NEXT: TypeIdMap:
+; CHECK-NEXT: WithGlobalValueDeadStripping: false
; CHECK-NEXT: ...
diff --git a/test/Transforms/WholeProgramDevirt/export-single-impl.ll b/test/Transforms/WholeProgramDevirt/export-single-impl.ll
index f4f3fd054c46..15de77381ed1 100644
--- a/test/Transforms/WholeProgramDevirt/export-single-impl.ll
+++ b/test/Transforms/WholeProgramDevirt/export-single-impl.ll
@@ -38,6 +38,7 @@
; SUMMARY-NEXT: Kind: SingleImpl
; SUMMARY-NEXT: SingleImplName: 'vf4$merged'
; SUMMARY-NEXT: ResByArg:
+; SUMMARY-NEXT: WithGlobalValueDeadStripping: false
; SUMMARY-NEXT: ...
; CHECK: @vt1 = constant void (i8*)* @vf1
diff --git a/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll
index 1d7030c41fd0..11b1c5de4d83 100644
--- a/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll
+++ b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll
@@ -1,8 +1,7 @@
; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
; RUN: FileCheck --check-prefix=SUMMARY %s < %t
-; SUMMARY: - TypeTests:
-; SUMMARY-NEXT: TypeTestAssumeVCalls:
+; SUMMARY-NOT: TypeTests:
; SUMMARY: TypeIdMap:
; SUMMARY-NEXT: typeid4:
diff --git a/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll
index 174a573b5b0d..0878d01cce03 100644
--- a/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll
+++ b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll
@@ -1,8 +1,7 @@
; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
; RUN: FileCheck --check-prefix=SUMMARY %s < %t
-; SUMMARY: - TypeTests:
-; SUMMARY-NEXT: TypeTestAssumeVCalls:
+; SUMMARY-NOT: TypeTests:
; SUMMARY: TypeIdMap:
; SUMMARY-NEXT: typeid3:
diff --git a/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll
index 0785ade28570..3132444a9f36 100644
--- a/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll
+++ b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll
@@ -1,7 +1,7 @@
; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -o /dev/null %s
; RUN: FileCheck %s < %t
-; CHECK: - TypeTests: [ 15427464259790519041, 17525413373118030901 ]
+; CHECK: TypeTests: [ 15427464259790519041, 17525413373118030901 ]
; CHECK-NEXT: TypeTestAssumeVCalls:
@vt1a = constant void (i8*)* @vf1a, !type !0
diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll
index 1de9352eeb22..73c982b17893 100644
--- a/test/Transforms/WholeProgramDevirt/import-indir.ll
+++ b/test/Transforms/WholeProgramDevirt/import-indir.ll
@@ -4,7 +4,9 @@
; SUMMARY: GlobalValueMap:
; SUMMARY-NEXT: 42:
-; SUMMARY-NEXT: - TypeTests:
+; SUMMARY-NEXT: - Linkage: 0
+; SUMMARY-NEXT: NotEligibleToImport: false
+; SUMMARY-NEXT: Live: true
; SUMMARY-NEXT: TypeTestAssumeVCalls:
; SUMMARY-NEXT: - GUID: 123
; SUMMARY-NEXT: Offset: 0
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index dbec9ab22527..7effb0c801b9 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -8,7 +8,7 @@
; Try the new pass manager LTO default pipeline (make sure the option
; is accepted).
-; RUN: llvm-lto2 run %t1.bc -o %t.o -lto-use-new-pm -r %t1.bc,patatino,px
+; RUN: llvm-lto2 run %t1.bc -o %t.o -use-new-pm -r %t1.bc,patatino,px
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 589005943045..e10d112dcf90 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -304,6 +304,9 @@ int main(int argc, char **argv) {
initializeScalarizeMaskedMemIntrinPass(*Registry);
initializeExpandReductionsPass(*Registry);
+ // Initialize debugging passes.
+ initializeScavengerTestPass(*Registry);
+
// Register the target printer for --version.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
diff --git a/tools/llvm-config/llvm-config.cpp b/tools/llvm-config/llvm-config.cpp
index 888da7143c9f..08b096afb052 100644
--- a/tools/llvm-config/llvm-config.cpp
+++ b/tools/llvm-config/llvm-config.cpp
@@ -333,7 +333,7 @@ int main(int argc, char **argv) {
} else {
ActivePrefix = CurrentExecPrefix;
ActiveIncludeDir = ActivePrefix + "/include";
- SmallString<PATH_MAX> path(StringRef(LLVM_TOOLS_INSTALL_DIR));
+ SmallString<256> path(StringRef(LLVM_TOOLS_INSTALL_DIR));
sys::fs::make_absolute(ActivePrefix, path);
ActiveBinDir = path.str();
ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX;
diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp
index 3d2643db85bd..89f85157e1df 100644
--- a/tools/llvm-lto2/llvm-lto2.cpp
+++ b/tools/llvm-lto2/llvm-lto2.cpp
@@ -99,6 +99,11 @@ static cl::opt<bool> OptRemarksWithHotness(
cl::desc("Whether to include hotness informations in the remarks.\n"
"Has effect only if -pass-remarks-output is specified."));
+static cl::opt<bool>
+ UseNewPM("use-new-pm",
+ cl::desc("Run LTO passes using the new pass manager"),
+ cl::init(false), cl::Hidden);
+
static void check(Error E, std::string Msg) {
if (!E)
return;
@@ -196,6 +201,7 @@ static int run(int argc, char **argv) {
Conf.AAPipeline = AAPipeline;
Conf.OptLevel = OptLevel - '0';
+ Conf.UseNewPM = UseNewPM;
switch (CGOptLevel) {
case '0':
Conf.CGOptLevel = CodeGenOpt::None;
@@ -351,7 +357,7 @@ int main(int argc, char **argv) {
// FIXME: This should use llvm::cl subcommands, but it isn't currently
// possible to pass an argument not associated with a subcommand to a
- // subcommand (e.g. -lto-use-new-pm).
+ // subcommand (e.g. -use-new-pm).
if (argc < 2)
return usage();
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index d95eca1aeddb..31c342cd0f5a 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -483,8 +483,8 @@ Error LLVMOutputStyle::dumpStreamBytes() {
if (SI >= File.getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
- auto S = MappedBlockStream::createIndexedStream(File.getMsfLayout(),
- File.getMsfBuffer(), SI);
+ auto S = MappedBlockStream::createIndexedStream(
+ File.getMsfLayout(), File.getMsfBuffer(), SI, File.getAllocator());
if (!S)
continue;
DictScope DD(P, "Stream");
@@ -791,7 +791,7 @@ Error LLVMOutputStyle::dumpDbiStream() {
if (HasModuleDI && (ShouldDumpSymbols || opts::raw::DumpLineInfo)) {
auto ModStreamData = MappedBlockStream::createIndexedStream(
File.getMsfLayout(), File.getMsfBuffer(),
- Modi.getModuleStreamIndex());
+ Modi.getModuleStreamIndex(), File.getAllocator());
ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData));
if (auto EC = ModS.reload())
@@ -804,7 +804,8 @@ Error LLVMOutputStyle::dumpDbiStream() {
auto &Types = *ExpectedTypes;
ListScope SS(P, "Symbols");
- codeview::CVSymbolDumper SD(P, Types, nullptr, false);
+ codeview::CVSymbolDumper SD(P, Types, CodeViewContainer::Pdb, nullptr,
+ false);
bool HadError = false;
for (auto S : ModS.symbols(&HadError)) {
DictScope LL(P, "");
@@ -830,8 +831,7 @@ Error LLVMOutputStyle::dumpDbiStream() {
return ExpectedTypes.takeError();
auto &IpiItems = *ExpectedTypes;
C13RawVisitor V(P, File, IpiItems);
- if (auto EC =
- codeview::visitDebugSubsections(ModS.linesAndChecksums(), V))
+ if (auto EC = codeview::visitDebugSubsections(ModS.subsections(), V))
return EC;
}
}
@@ -952,7 +952,7 @@ Error LLVMOutputStyle::dumpPublicsStream() {
return ExpectedTypes.takeError();
auto &Tpi = *ExpectedTypes;
- codeview::CVSymbolDumper SD(P, Tpi, nullptr, false);
+ codeview::CVSymbolDumper SD(P, Tpi, CodeViewContainer::Pdb, nullptr, false);
bool HadError = false;
for (auto S : Publics->getSymbols(&HadError)) {
DictScope DD(P, "");
diff --git a/tools/llvm-pdbdump/PdbYaml.cpp b/tools/llvm-pdbdump/PdbYaml.cpp
index e288063e2afa..b4a41fbfdb8f 100644
--- a/tools/llvm-pdbdump/PdbYaml.cpp
+++ b/tools/llvm-pdbdump/PdbYaml.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
@@ -21,6 +22,7 @@
#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h"
#include "llvm/ObjectYAML/CodeViewYAMLTypes.h"
using namespace llvm;
@@ -220,6 +222,6 @@ void MappingTraits<PdbDbiModuleInfo>::mapping(IO &IO, PdbDbiModuleInfo &Obj) {
IO.mapRequired("Module", Obj.Mod);
IO.mapOptional("ObjFile", Obj.Obj, Obj.Mod);
IO.mapOptional("SourceFiles", Obj.SourceFiles);
- IO.mapOptional("LineInfo", Obj.FileLineInfo);
+ IO.mapOptional("Subsections", Obj.Subsections);
IO.mapOptional("Modi", Obj.Modi);
}
diff --git a/tools/llvm-pdbdump/PdbYaml.h b/tools/llvm-pdbdump/PdbYaml.h
index deb500ec2074..62ed608916fc 100644
--- a/tools/llvm-pdbdump/PdbYaml.h
+++ b/tools/llvm-pdbdump/PdbYaml.h
@@ -28,6 +28,9 @@
#include <vector>
namespace llvm {
+namespace codeview {
+class DebugStringTableSubsection;
+}
namespace pdb {
namespace yaml {
@@ -68,7 +71,7 @@ struct PdbDbiModuleInfo {
StringRef Obj;
StringRef Mod;
std::vector<StringRef> SourceFiles;
- Optional<CodeViewYAML::SourceFileInfo> FileLineInfo;
+ std::vector<CodeViewYAML::YAMLDebugSubsection> Subsections;
Optional<PdbModiStream> Modi;
};
diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.cpp b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
index 18839a7679d3..ee72b90b12d1 100644
--- a/tools/llvm-pdbdump/YAMLOutputStyle.cpp
+++ b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
@@ -101,117 +101,6 @@ Error YAMLOutputStyle::dump() {
return Error::success();
}
-namespace {
-class C13YamlVisitor : public C13DebugFragmentVisitor {
-public:
- C13YamlVisitor(CodeViewYAML::SourceFileInfo &Info, PDBFile &F)
- : C13DebugFragmentVisitor(F), Info(Info) {}
-
- Error handleFileChecksums() override {
- for (const auto &C : *Checksums) {
- CodeViewYAML::SourceFileChecksumEntry Entry;
- if (auto Result = getNameFromStringTable(C.FileNameOffset))
- Entry.FileName = *Result;
- else
- return Result.takeError();
-
- Entry.Kind = C.Kind;
- Entry.ChecksumBytes.Bytes = C.Checksum;
- Info.FileChecksums.push_back(Entry);
- }
- return Error::success();
- }
-
- Error handleLines() override {
- for (const auto &LF : Lines) {
- Info.LineFragments.emplace_back();
- auto &Fragment = Info.LineFragments.back();
-
- Fragment.CodeSize = LF.header()->CodeSize;
- Fragment.Flags =
- static_cast<codeview::LineFlags>(uint16_t(LF.header()->Flags));
- Fragment.RelocOffset = LF.header()->RelocOffset;
- Fragment.RelocSegment = LF.header()->RelocSegment;
-
- for (const auto &L : LF) {
- Fragment.Blocks.emplace_back();
- auto &Block = Fragment.Blocks.back();
-
- if (auto Result = getNameFromChecksumsBuffer(L.NameIndex))
- Block.FileName = *Result;
- else
- return Result.takeError();
-
- for (const auto &N : L.LineNumbers) {
- CodeViewYAML::SourceLineEntry Line;
- Line.Offset = N.Offset;
- codeview::LineInfo LI(N.Flags);
- Line.LineStart = LI.getStartLine();
- Line.EndDelta = LI.getLineDelta();
- Line.IsStatement = LI.isStatement();
- Block.Lines.push_back(Line);
- }
-
- if (LF.hasColumnInfo()) {
- for (const auto &C : L.Columns) {
- CodeViewYAML::SourceColumnEntry Column;
- Column.StartColumn = C.StartColumn;
- Column.EndColumn = C.EndColumn;
- Block.Columns.push_back(Column);
- }
- }
- }
- }
- return Error::success();
- }
-
- Error handleInlineeLines() override {
- for (const auto &ILF : InlineeLines) {
- Info.Inlinees.emplace_back();
- auto &Inlinee = Info.Inlinees.back();
-
- Inlinee.HasExtraFiles = ILF.hasExtraFiles();
- for (const auto &IL : ILF) {
- Inlinee.Sites.emplace_back();
- auto &Site = Inlinee.Sites.back();
- if (auto Result = getNameFromChecksumsBuffer(IL.Header->FileID))
- Site.FileName = *Result;
- else
- return Result.takeError();
-
- Site.Inlinee = IL.Header->Inlinee.getIndex();
- Site.SourceLineNum = IL.Header->SourceLineNum;
- if (ILF.hasExtraFiles()) {
- for (const auto &EF : IL.ExtraFiles) {
- if (auto Result = getNameFromChecksumsBuffer(EF))
- Site.ExtraFiles.push_back(*Result);
- else
- return Result.takeError();
- }
- }
- }
- }
- return Error::success();
- }
-
-private:
- CodeViewYAML::SourceFileInfo &Info;
-};
-}
-
-Expected<Optional<CodeViewYAML::SourceFileInfo>>
-YAMLOutputStyle::getFileLineInfo(const pdb::ModuleDebugStreamRef &ModS) {
- if (!ModS.hasLineInfo())
- return None;
-
- CodeViewYAML::SourceFileInfo Info;
- C13YamlVisitor Visitor(Info, File);
- if (auto EC =
- codeview::visitDebugSubsections(ModS.linesAndChecksums(), Visitor))
- return std::move(EC);
-
- return Info;
-}
Error YAMLOutputStyle::dumpFileHeaders() {
if (opts::pdb2yaml::NoFileHeaders)
@@ -236,14 +125,17 @@ Error YAMLOutputStyle::dumpFileHeaders() {
}
Error YAMLOutputStyle::dumpStringTable() {
- if (!opts::pdb2yaml::StringTable)
+ bool RequiresStringTable = opts::pdb2yaml::DbiModuleSourceFileInfo ||
+ opts::pdb2yaml::DbiModuleSourceLineInfo;
+ bool RequestedStringTable = opts::pdb2yaml::StringTable;
+ if (!RequiresStringTable && !RequestedStringTable)
return Error::success();
- Obj.StringTable.emplace();
auto ExpectedST = File.getStringTable();
if (!ExpectedST)
return ExpectedST.takeError();
+ Obj.StringTable.emplace();
const auto &ST = ExpectedST.get();
for (auto ID : ST.name_ids()) {
auto S = ST.getStringForID(ID);
@@ -337,17 +229,30 @@ Error YAMLOutputStyle::dumpDbiStream() {
continue;
auto ModStreamData = msf::MappedBlockStream::createIndexedStream(
- File.getMsfLayout(), File.getMsfBuffer(), ModiStream);
+ File.getMsfLayout(), File.getMsfBuffer(), ModiStream,
+ File.getAllocator());
pdb::ModuleDebugStreamRef ModS(MI, std::move(ModStreamData));
if (auto EC = ModS.reload())
return EC;
- if (opts::pdb2yaml::DbiModuleSourceLineInfo) {
- auto ExpectedInfo = getFileLineInfo(ModS);
- if (!ExpectedInfo)
- return ExpectedInfo.takeError();
- DMI.FileLineInfo = *ExpectedInfo;
+ auto ExpectedST = File.getStringTable();
+ if (!ExpectedST)
+ return ExpectedST.takeError();
+ if (opts::pdb2yaml::DbiModuleSourceLineInfo &&
+ ModS.hasDebugSubsections()) {
+ auto ExpectedChecksums = ModS.findChecksumsSubsection();
+ if (!ExpectedChecksums)
+ return ExpectedChecksums.takeError();
+
+ for (const auto &SS : ModS.subsections()) {
+ auto Converted =
+ CodeViewYAML::YAMLDebugSubsection::fromCodeViewSubection(
+ ExpectedST->getStringTable(), *ExpectedChecksums, SS);
+ if (!Converted)
+ return Converted.takeError();
+ DMI.Subsections.push_back(*Converted);
+ }
}
if (opts::pdb2yaml::DbiModuleSyms) {
diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.h b/tools/llvm-pdbdump/YAMLOutputStyle.h
index 6e4067c48f88..3690e3529d4a 100644
--- a/tools/llvm-pdbdump/YAMLOutputStyle.h
+++ b/tools/llvm-pdbdump/YAMLOutputStyle.h
@@ -27,9 +27,6 @@ public:
Error dump() override;
private:
- Expected<Optional<CodeViewYAML::SourceFileInfo>>
- getFileLineInfo(const pdb::ModuleDebugStreamRef &ModS);
-
Error dumpStringTable();
Error dumpFileHeaders();
Error dumpStreamMetadata();
diff --git a/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp b/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
index 14cd222d138a..5f09416a9ff6 100644
--- a/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
+++ b/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
@@ -85,7 +85,7 @@ extern "C" int LLVMFuzzerTestOneInput(uint8_t *data, size_t size) {
for (auto &Modi : DS.modules()) {
auto ModStreamData = pdb::MappedBlockStream::createIndexedStream(
- Modi.Info.getModuleStreamIndex(), *File);
+ Modi.Info.getModuleStreamIndex(), *File, File->getAllocator());
if (!ModStreamData) {
consumeError(ModStreamData.takeError());
return 0;
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
index 0b2b766a3c52..4626de9c4440 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -476,7 +476,6 @@ static void yamlToPdb(StringRef Path) {
std::unique_ptr<MemoryBuffer> &Buffer = ErrorOrBuffer.get();
llvm::yaml::Input In(Buffer->getBuffer());
- In.setContext(&Allocator);
pdb::yaml::PdbObject YamlObj(Allocator);
In >> YamlObj;
@@ -535,67 +534,16 @@ static void yamlToPdb(StringRef Path) {
ExitOnErr(DbiBuilder.addModuleSourceFile(MI.Mod, S));
if (MI.Modi.hasValue()) {
const auto &ModiStream = *MI.Modi;
- for (auto Symbol : ModiStream.Symbols)
- ModiBuilder.addSymbol(Symbol.toCodeViewSymbol(Allocator));
- }
- if (MI.FileLineInfo.hasValue()) {
- const auto &FLI = *MI.FileLineInfo;
-
- // File Checksums must be emitted before line information, because line
- // info records use offsets into the checksum buffer to reference a file's
- // source file name.
- auto Checksums = llvm::make_unique<DebugChecksumsSubsection>(Strings);
- auto &ChecksumRef = *Checksums;
- if (!FLI.FileChecksums.empty()) {
- for (auto &FC : FLI.FileChecksums)
- Checksums->addChecksum(FC.FileName, FC.Kind, FC.ChecksumBytes.Bytes);
- }
- ModiBuilder.setC13FileChecksums(std::move(Checksums));
-
- for (const auto &Fragment : FLI.LineFragments) {
- auto Lines =
- llvm::make_unique<DebugLinesSubsection>(ChecksumRef, Strings);
- Lines->setCodeSize(Fragment.CodeSize);
- Lines->setRelocationAddress(Fragment.RelocSegment,
- Fragment.RelocOffset);
- Lines->setFlags(Fragment.Flags);
- for (const auto &LC : Fragment.Blocks) {
- Lines->createBlock(LC.FileName);
- if (Lines->hasColumnInfo()) {
- for (const auto &Item : zip(LC.Lines, LC.Columns)) {
- auto &L = std::get<0>(Item);
- auto &C = std::get<1>(Item);
- uint32_t LE = L.LineStart + L.EndDelta;
- Lines->addLineAndColumnInfo(
- L.Offset, LineInfo(L.LineStart, LE, L.IsStatement),
- C.StartColumn, C.EndColumn);
- }
- } else {
- for (const auto &L : LC.Lines) {
- uint32_t LE = L.LineStart + L.EndDelta;
- Lines->addLineInfo(L.Offset,
- LineInfo(L.LineStart, LE, L.IsStatement));
- }
- }
- }
- ModiBuilder.addC13Fragment(std::move(Lines));
+ for (auto Symbol : ModiStream.Symbols) {
+ ModiBuilder.addSymbol(
+ Symbol.toCodeViewSymbol(Allocator, CodeViewContainer::Pdb));
}
+ }
- for (const auto &Inlinee : FLI.Inlinees) {
- auto Inlinees = llvm::make_unique<DebugInlineeLinesSubsection>(
- ChecksumRef, Inlinee.HasExtraFiles);
- for (const auto &Site : Inlinee.Sites) {
- Inlinees->addInlineSite(TypeIndex(Site.Inlinee), Site.FileName,
- Site.SourceLineNum);
- if (!Inlinee.HasExtraFiles)
- continue;
-
- for (auto EF : Site.ExtraFiles) {
- Inlinees->addExtraFile(EF);
- }
- }
- ModiBuilder.addC13Fragment(std::move(Inlinees));
- }
+ auto CodeViewSubsections =
+ ExitOnErr(CodeViewYAML::convertSubsectionList(MI.Subsections, Strings));
+ for (auto &SS : CodeViewSubsections) {
+ ModiBuilder.addDebugSubsection(std::move(SS));
}
}
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 663f7b4c8a82..bc07bd296ad2 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -978,7 +978,8 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
Subsection.bytes_end());
auto CODD = llvm::make_unique<COFFObjectDumpDelegate>(*this, Section, Obj,
SectionContents);
- CVSymbolDumper CVSD(W, Types, std::move(CODD), opts::CodeViewSubsectionBytes);
+ CVSymbolDumper CVSD(W, Types, CodeViewContainer::ObjectFile, std::move(CODD),
+ opts::CodeViewSubsectionBytes);
CVSymbolArray Symbols;
BinaryStreamReader Reader(BinaryData, llvm::support::little);
if (auto EC = Reader.readArray(Symbols, Reader.getLength())) {
diff --git a/unittests/ADT/SmallVectorTest.cpp b/unittests/ADT/SmallVectorTest.cpp
index 7367ad470e3a..ca6391024f27 100644
--- a/unittests/ADT/SmallVectorTest.cpp
+++ b/unittests/ADT/SmallVectorTest.cpp
@@ -424,6 +424,16 @@ TYPED_TEST(SmallVectorTest, AssignTest) {
this->assertValuesInOrder(this->theVector, 2u, 77, 77);
}
+// Assign from iterator range test
+TYPED_TEST(SmallVectorTest, AssignRangeTest) {
+ SCOPED_TRACE("AssignRangeTest");
+
+ this->theVector.push_back(Constructable(1));
+ int arr[] = {1, 2, 3};
+ this->theVector.assign(std::begin(arr), std::end(arr));
+ this->assertValuesInOrder(this->theVector, 3u, 1, 2, 3);
+}
+
// Move-assign test
TYPED_TEST(SmallVectorTest, MoveAssignTest) {
SCOPED_TRACE("MoveAssignTest");
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
index 40d5ea5f5ad7..8082c54b9c66 100644
--- a/unittests/Analysis/CMakeLists.txt
+++ b/unittests/Analysis/CMakeLists.txt
@@ -9,17 +9,18 @@ add_llvm_unittest(AnalysisTests
AliasAnalysisTest.cpp
BlockFrequencyInfoTest.cpp
BranchProbabilityInfoTest.cpp
+ CallGraphTest.cpp
CFGTest.cpp
CGSCCPassManagerTest.cpp
- CallGraphTest.cpp
LazyCallGraphTest.cpp
LoopInfoTest.cpp
MemoryBuiltinsTest.cpp
MemorySSA.cpp
+ OrderedBasicBlockTest.cpp
ProfileSummaryInfoTest.cpp
ScalarEvolutionTest.cpp
- TBAATest.cpp
TargetLibraryInfoTest.cpp
+ TBAATest.cpp
UnrollAnalyzer.cpp
ValueTrackingTest.cpp
)
diff --git a/unittests/Analysis/OrderedBasicBlockTest.cpp b/unittests/Analysis/OrderedBasicBlockTest.cpp
new file mode 100644
index 000000000000..b8b9ff04ce7c
--- /dev/null
+++ b/unittests/Analysis/OrderedBasicBlockTest.cpp
@@ -0,0 +1,58 @@
+//===- OrderedBasicBlockTest.cpp - OrderedBasicBlock unit tests -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+class OrderedBasicBlockTest : public testing::Test {
+protected:
+ LLVMContext C;
+
+ std::unique_ptr<Module> makeLLVMModule() {
+ const char *ModuleString = R"(define i32 @f(i32 %x) {
+ %add = add i32 %x, 42
+ ret i32 %add
+ })";
+ SMDiagnostic Err;
+ auto M = parseAssemblyString(ModuleString, Err, C);
+ return M;
+ }
+};
+
+TEST_F(OrderedBasicBlockTest, Basic) {
+ auto M = makeLLVMModule();
+ Function *F = M->getFunction("f");
+ BasicBlock::iterator I = F->front().begin();
+ Instruction *Add = &*I++;
+ Instruction *Ret = &*I++;
+
+ OrderedBasicBlock OBB(&F->front());
+ // Intentionally duplicated to verify cached and uncached are the same.
+ EXPECT_FALSE(OBB.dominates(Add, Add));
+ EXPECT_FALSE(OBB.dominates(Add, Add));
+ EXPECT_TRUE(OBB.dominates(Add, Ret));
+ EXPECT_TRUE(OBB.dominates(Add, Ret));
+ EXPECT_FALSE(OBB.dominates(Ret, Add));
+ EXPECT_FALSE(OBB.dominates(Ret, Add));
+ EXPECT_FALSE(OBB.dominates(Ret, Ret));
+ EXPECT_FALSE(OBB.dominates(Ret, Ret));
+}
+
+} // end anonymous namespace
+} // end namespace llvm
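
OrderedBasicBlock answers intra-block dominance queries by lazily numbering instructions, which is why the test repeats each EXPECT: the first call exercises the lazy scan, the second the cache. A self-contained analogue of that caching scheme (illustrative; not the LLVM implementation):

// Lazily assign each instruction a position the first time it is queried,
// so repeated dominates() calls on the same instructions are O(1).
#include <cassert>
#include <unordered_map>
#include <vector>

struct Inst { int Id; };

class OrderedBlock {
  const std::vector<Inst *> &Block;                // instructions in order
  std::unordered_map<const Inst *, unsigned> Pos;  // lazily filled cache
  unsigned NextIdx = 0;

  unsigned position(const Inst *I) {
    auto It = Pos.find(I);
    if (It != Pos.end())
      return It->second;                           // cached: O(1)
    // Walk forward from where the last lazy scan stopped.
    while (NextIdx < Block.size()) {
      Pos[Block[NextIdx]] = NextIdx;
      if (Block[NextIdx++] == I)
        return NextIdx - 1;
    }
    assert(false && "instruction not in block");
    return 0;
  }

public:
  explicit OrderedBlock(const std::vector<Inst *> &B) : Block(B) {}
  // A dominates B within one block iff A comes strictly before B, matching
  // the EXPECT_FALSE(OBB.dominates(Add, Add)) checks above.
  bool dominates(const Inst *A, const Inst *B) {
    return position(A) < position(B);
  }
};

int main() {
  Inst Add{0}, Ret{1};
  std::vector<Inst *> Insts = {&Add, &Ret};
  OrderedBlock OBB(Insts);
  assert(OBB.dominates(&Add, &Ret));
  assert(!OBB.dominates(&Ret, &Add));
  assert(!OBB.dominates(&Add, &Add));              // strict, no self-dominance
}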
diff --git a/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp b/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
index 9d90e265df33..789fe515b018 100644
--- a/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
+++ b/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
@@ -70,6 +70,8 @@ public:
return MSFStreamLayout{static_cast<uint32_t>(Data.size()), Blocks};
}
+ BumpPtrAllocator Allocator;
+
private:
std::vector<support::ulittle32_t> Blocks;
MutableArrayRef<uint8_t> Data;
@@ -77,7 +79,8 @@ private:
TEST(MappedBlockStreamTest, NumBlocks) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
EXPECT_EQ(F.block_size(), S->getBlockSize());
EXPECT_EQ(F.layout().Blocks.size(), S->getNumBlocks());
@@ -87,7 +90,8 @@ TEST(MappedBlockStreamTest, NumBlocks) {
// and does not allocate.
TEST(MappedBlockStreamTest, ReadBeyondEndOfStreamRef) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
BinaryStreamRef SR;
@@ -102,13 +106,14 @@ TEST(MappedBlockStreamTest, ReadBeyondEndOfStreamRef) {
// does not fail due to the length of the output buffer.
TEST(MappedBlockStreamTest, ReadOntoNonEmptyBuffer) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str = "ZYXWVUTSRQPONMLKJIHGFEDCBA";
EXPECT_NO_ERROR(R.readFixedString(Str, 1));
EXPECT_EQ(Str, StringRef("A"));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Tests that a read which crosses a block boundary, but where the subsequent
@@ -116,18 +121,18 @@ TEST(MappedBlockStreamTest, ReadOntoNonEmptyBuffer) {
// not allocate memory.
TEST(MappedBlockStreamTest, ZeroCopyReadContiguousBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(),
- F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
EXPECT_NO_ERROR(R.readFixedString(Str, 2));
EXPECT_EQ(Str, StringRef("AB"));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
R.setOffset(6);
EXPECT_NO_ERROR(R.readFixedString(Str, 4));
EXPECT_EQ(Str, StringRef("GHIJ"));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Tests that a read which crosses a block boundary and cannot be referenced
@@ -135,62 +140,67 @@ TEST(MappedBlockStreamTest, ZeroCopyReadContiguousBreak) {
// requested.
TEST(MappedBlockStreamTest, CopyReadNonContiguousBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
EXPECT_NO_ERROR(R.readFixedString(Str, 10));
EXPECT_EQ(Str, StringRef("ABCDEFGHIJ"));
- EXPECT_EQ(10U, S->getNumBytesCopied());
+ EXPECT_EQ(10U, F.Allocator.getBytesAllocated());
}
// Test that an out of bounds read which doesn't cross a block boundary
// fails and allocates no memory.
TEST(MappedBlockStreamTest, InvalidReadSizeNoBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
R.setOffset(10);
EXPECT_ERROR(R.readFixedString(Str, 1));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Test that an out of bounds read which crosses a contiguous block boundary
// fails and allocates no memory.
TEST(MappedBlockStreamTest, InvalidReadSizeContiguousBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
R.setOffset(6);
EXPECT_ERROR(R.readFixedString(Str, 5));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Test that an out of bounds read which crosses a discontiguous block
// boundary fails and allocates no memory.
TEST(MappedBlockStreamTest, InvalidReadSizeNonContiguousBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
EXPECT_ERROR(R.readFixedString(Str, 11));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Tests that a read which is entirely contained within a single block but
// beyond the end of a StreamRef fails.
TEST(MappedBlockStreamTest, ZeroCopyReadNoBreak) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str;
EXPECT_NO_ERROR(R.readFixedString(Str, 1));
EXPECT_EQ(Str, StringRef("A"));
- EXPECT_EQ(0U, S->getNumBytesCopied());
+ EXPECT_EQ(0U, F.Allocator.getBytesAllocated());
}
// Tests that a read which is not aligned on the same boundary as a previous
@@ -198,19 +208,20 @@ TEST(MappedBlockStreamTest, ZeroCopyReadNoBreak) {
// previous allocation.
TEST(MappedBlockStreamTest, UnalignedOverlappingRead) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str1;
StringRef Str2;
EXPECT_NO_ERROR(R.readFixedString(Str1, 7));
EXPECT_EQ(Str1, StringRef("ABCDEFG"));
- EXPECT_EQ(7U, S->getNumBytesCopied());
+ EXPECT_EQ(7U, F.Allocator.getBytesAllocated());
R.setOffset(2);
EXPECT_NO_ERROR(R.readFixedString(Str2, 3));
EXPECT_EQ(Str2, StringRef("CDE"));
EXPECT_EQ(Str1.data() + 2, Str2.data());
- EXPECT_EQ(7U, S->getNumBytesCopied());
+ EXPECT_EQ(7U, F.Allocator.getBytesAllocated());
}
// Tests that a read which is not aligned on the same boundary as a previous
@@ -218,18 +229,19 @@ TEST(MappedBlockStreamTest, UnalignedOverlappingRead) {
// still works correctly and allocates again from the shared pool.
TEST(MappedBlockStreamTest, UnalignedOverlappingReadFail) {
DiscontiguousStream F(BlocksAry, DataAry);
- auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F);
+ auto S = MappedBlockStream::createStream(F.block_size(), F.layout(), F,
+ F.Allocator);
BinaryStreamReader R(*S);
StringRef Str1;
StringRef Str2;
EXPECT_NO_ERROR(R.readFixedString(Str1, 6));
EXPECT_EQ(Str1, StringRef("ABCDEF"));
- EXPECT_EQ(6U, S->getNumBytesCopied());
+ EXPECT_EQ(6U, F.Allocator.getBytesAllocated());
R.setOffset(4);
EXPECT_NO_ERROR(R.readFixedString(Str2, 4));
EXPECT_EQ(Str2, StringRef("EFGH"));
- EXPECT_EQ(10U, S->getNumBytesCopied());
+ EXPECT_EQ(10U, F.Allocator.getBytesAllocated());
}
TEST(MappedBlockStreamTest, WriteBeyondEndOfStream) {
@@ -241,8 +253,8 @@ TEST(MappedBlockStreamTest, WriteBeyondEndOfStream) {
"LargeBuffer is not big enough");
DiscontiguousStream F(BlocksAry, Data);
- auto S = WritableMappedBlockStream::createStream(
- F.block_size(), F.layout(), F);
+ auto S = WritableMappedBlockStream::createStream(F.block_size(), F.layout(),
+ F, F.Allocator);
ArrayRef<uint8_t> Buffer;
EXPECT_ERROR(S->writeBytes(0, ArrayRef<uint8_t>(LargeBuffer)));
@@ -254,8 +266,8 @@ TEST(MappedBlockStreamTest, WriteBeyondEndOfStream) {
TEST(MappedBlockStreamTest, TestWriteBytesNoBreakBoundary) {
static uint8_t Data[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'};
DiscontiguousStream F(BlocksAry, Data);
- auto S = WritableMappedBlockStream::createStream(
- F.block_size(), F.layout(), F);
+ auto S = WritableMappedBlockStream::createStream(F.block_size(), F.layout(),
+ F, F.Allocator);
ArrayRef<uint8_t> Buffer;
EXPECT_NO_ERROR(S->readBytes(0, 1, Buffer));
@@ -287,8 +299,8 @@ TEST(MappedBlockStreamTest, TestWriteBytesBreakBoundary) {
'T', 'G', '.', '0', '0'};
DiscontiguousStream F(BlocksAry, Data);
- auto S = WritableMappedBlockStream::createStream(
- F.block_size(), F.layout(), F);
+ auto S = WritableMappedBlockStream::createStream(F.block_size(), F.layout(),
+ F, F.Allocator);
ArrayRef<uint8_t> Buffer;
EXPECT_NO_ERROR(S->writeBytes(0, TestData));
@@ -306,8 +318,8 @@ TEST(MappedBlockStreamTest, TestWriteThenRead) {
const uint32_t Blocks[] = {2, 1, 0, 6, 3, 4, 5, 7, 9, 8};
DiscontiguousStream F(Blocks, Data);
- auto S = WritableMappedBlockStream::createStream(
- F.block_size(), F.layout(), F);
+ auto S = WritableMappedBlockStream::createStream(F.block_size(), F.layout(),
+ F, F.Allocator);
enum class MyEnum : uint32_t { Val1 = 2908234, Val2 = 120891234 };
using support::ulittle32_t;
@@ -399,7 +411,7 @@ TEST(MappedBlockStreamTest, TestWriteContiguousStreamRef) {
DiscontiguousStream F(DestBlocks, DestData);
auto DestStream = WritableMappedBlockStream::createStream(
- F.block_size(), F.layout(), F);
+ F.block_size(), F.layout(), F, F.Allocator);
// First write "Test Str" into the source stream.
MutableBinaryByteStream SourceStream(SrcData, little);
@@ -434,9 +446,9 @@ TEST(MappedBlockStreamTest, TestWriteDiscontiguousStreamRef) {
DiscontiguousStream SrcF(SrcBlocks, SrcData);
auto Dest = WritableMappedBlockStream::createStream(
- DestF.block_size(), DestF.layout(), DestF);
+ DestF.block_size(), DestF.layout(), DestF, DestF.Allocator);
auto Src = WritableMappedBlockStream::createStream(
- SrcF.block_size(), SrcF.layout(), SrcF);
+ SrcF.block_size(), SrcF.layout(), SrcF, SrcF.Allocator);
// First write "Test Str" into the source stream.
BinaryStreamWriter SourceWriter(*Src);
@@ -457,4 +469,27 @@ TEST(MappedBlockStreamTest, TestWriteDiscontiguousStreamRef) {
EXPECT_EQ(Result, "Test Str");
}
+TEST(MappedBlockStreamTest, DataLivesAfterStreamDestruction) {
+ std::vector<uint8_t> DataBytes(10);
+ MutableArrayRef<uint8_t> Data(DataBytes);
+ const uint32_t Blocks[] = {2, 1, 0, 6, 3, 4, 5, 7, 9, 8};
+
+ StringRef Str[] = {"Zero Str", ""};
+
+ DiscontiguousStream F(Blocks, Data);
+ {
+ auto S = WritableMappedBlockStream::createStream(F.block_size(), F.layout(),
+ F, F.Allocator);
+
+ BinaryStreamReader Reader(*S);
+ BinaryStreamWriter Writer(*S);
+ ::memset(DataBytes.data(), 0, 10);
+ EXPECT_NO_ERROR(Writer.writeCString(Str[0]));
+ EXPECT_NO_ERROR(Reader.readCString(Str[1]));
+ EXPECT_EQ(Str[0], Str[1]);
+ }
+
+ EXPECT_EQ(Str[0], Str[1]);
+}
+
} // end anonymous namespace
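
Every createStream call above now takes a caller-owned BumpPtrAllocator, and the accounting assertions move from S->getNumBytesCopied() to F.Allocator.getBytesAllocated(); the payoff is the final test, where copied data survives the stream's destruction. A self-contained analogue of that ownership change (stand-in types, not the LLVM classes):

// The stream borrows a caller-owned arena for block-crossing copies, so the
// copied bytes remain valid after the stream itself is destroyed.
#include <cassert>
#include <cstring>
#include <deque>
#include <string>
#include <vector>

struct Arena {                           // stand-in for BumpPtrAllocator
  std::deque<std::vector<char>> Chunks;  // deque keeps old chunks stable
  std::size_t BytesAllocated = 0;
  char *allocate(std::size_t N) {
    Chunks.emplace_back(N);
    BytesAllocated += N;
    return Chunks.back().data();
  }
};

class BlockStream {                      // stand-in for MappedBlockStream
  const std::vector<std::string> &Blocks;
  Arena &A;                              // borrowed; owned by the caller
public:
  BlockStream(const std::vector<std::string> &B, Arena &Ar)
      : Blocks(B), A(Ar) {}
  // Reads within one block are zero-copy; reads spanning a boundary copy
  // into the caller's arena, which is what getBytesAllocated() measures.
  const char *read(std::size_t Block, std::size_t Off, std::size_t Len) {
    if (Off + Len <= Blocks[Block].size())
      return Blocks[Block].data() + Off;           // zero-copy view
    char *Buf = A.allocate(Len);
    std::size_t First = Blocks[Block].size() - Off;
    std::memcpy(Buf, Blocks[Block].data() + Off, First);
    std::memcpy(Buf + First, Blocks[Block + 1].data(), Len - First);
    return Buf;
  }
};

int main() {
  std::vector<std::string> Blocks = {"ABCDE", "FGHIJ"};
  Arena A;
  const char *P;
  {
    BlockStream S(Blocks, A);
    P = S.read(0, 3, 4);                 // "DEFG" crosses a block boundary
    assert(A.BytesAllocated == 4);
  }                                      // stream destroyed here
  assert(std::strncmp(P, "DEFG", 4) == 0);  // copy still alive in the arena
}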
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index d13547a842e4..db3d10847cd8 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -361,7 +361,7 @@ TEST_F(CloneFunc, NewFunctionCreated) {
// Test that a new subprogram entry was added and is pointing to the new
// function, while the original subprogram still points to the old one.
TEST_F(CloneFunc, Subprogram) {
- EXPECT_FALSE(verifyModule(*M));
+ EXPECT_FALSE(verifyModule(*M, &errs()));
EXPECT_EQ(3U, Finder->subprogram_count());
EXPECT_NE(NewFunc->getSubprogram(), OldFunc->getSubprogram());
}
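
verifyModule's second parameter is an optional raw_ostream; passing &errs() makes a failing check print the verifier's diagnostics instead of only flipping a boolean. A minimal usage sketch (assumes LLVM headers are available; the helper name is illustrative):

// Hedged sketch of the API the one-line change above relies on:
// verifyModule returns true when the module is broken and streams its
// complaints to the raw_ostream if one is supplied.
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

bool moduleIsValid(const llvm::Module &M) {
  // Pass &errs() so any verifier failure explains itself on stderr.
  return !llvm::verifyModule(M, &llvm::errs());
}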
diff --git a/utils/TableGen/X86FoldTablesEmitter.cpp b/utils/TableGen/X86FoldTablesEmitter.cpp
index b89cee2ce4bb..34f5fbc6ea31 100644
--- a/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -101,6 +101,11 @@ const char *const NoFoldSet[] = {
"BTS16rr", "BTS32rr", "BTS64rr",
"BTS16mr", "BTS32mr", "BTS64mr",
+ // insertps cannot be folded without adjusting the immediate. There's custom
+ // code to handle it in X86InstrInfo.cpp, so ignore it here.
+ "INSERTPSrr", "INSERTPSrm",
+ "VINSERTPSrr", "VINSERTPSrm", "VINSERTPSZrr", "VINSERTPSZrm",
+
// Memory folding is enabled only when optimizing for size by DAG
// patterns only. (issue detailed in D28744 review)
"VCVTSS2SDrm", "VCVTSS2SDrr",
diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
index 104e9dac464d..8991588a868d 100644
--- a/utils/lit/lit/util.py
+++ b/utils/lit/lit/util.py
@@ -267,6 +267,20 @@ def usePlatformSdkOnDarwin(config, lit_config):
lit_config.note('using SDKROOT: %r' % sdk_path)
config.environment['SDKROOT'] = sdk_path
+def findPlatformSdkVersionOnMacOS(config, lit_config):
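+ """Return the macOS SDK version reported by xcrun, or None on failure."""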
+ if 'darwin' in config.target_triple:
+ try:
+ cmd = subprocess.Popen(['xcrun', '--show-sdk-version', '--sdk', 'macosx'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = cmd.communicate()
+ out = out.strip()
+ res = cmd.wait()
+ except OSError:
+ res = -1
+ if res == 0 and out:
+ return out
+ return None
+
def killProcessAndChildren(pid):
"""
This function kills a process with ``pid`` and all its