-rw-r--r--docs/CMake.rst4
-rw-r--r--docs/GetElementPtr.rst12
-rw-r--r--docs/LangRef.rst6
-rw-r--r--include/llvm/ADT/APInt.h17
-rw-r--r--include/llvm/ADT/SmallPtrSet.h2
-rw-r--r--include/llvm/ADT/Statistic.h12
-rw-r--r--include/llvm/ADT/Triple.h10
-rw-r--r--include/llvm/Analysis/MemorySSA.h113
-rw-r--r--include/llvm/Analysis/ScalarEvolution.h26
-rw-r--r--include/llvm/Analysis/TargetLibraryInfo.h13
-rw-r--r--include/llvm/Analysis/ValueTracking.h37
-rw-r--r--include/llvm/CodeGen/LiveIntervalAnalysis.h2
-rw-r--r--include/llvm/CodeGen/MachineValueType.h234
-rw-r--r--include/llvm/CodeGen/Passes.h51
-rw-r--r--include/llvm/CodeGen/StackProtector.h12
-rw-r--r--include/llvm/CodeGen/ValueTypes.td220
-rw-r--r--include/llvm/DebugInfo/CodeView/CVRecord.h4
-rw-r--r--include/llvm/DebugInfo/CodeView/CVTypeDumper.h61
-rw-r--r--include/llvm/DebugInfo/CodeView/CVTypeVisitor.h35
-rw-r--r--include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h (renamed from include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h)45
-rw-r--r--include/llvm/DebugInfo/CodeView/SymbolDumper.h8
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeCollection.h38
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDatabase.h19
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h18
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeIndex.h13
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeSerializer.h2
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeStreamMerger.h5
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeTableCollection.h42
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h2
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h5
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFContext.h8
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFRelocMap.h2
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h5
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFUnit.h16
-rw-r--r--include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h1
-rw-r--r--include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h2
-rw-r--r--include/llvm/IR/Argument.h3
-rw-r--r--include/llvm/IR/BasicBlock.h6
-rw-r--r--include/llvm/IR/Constant.h2
-rw-r--r--include/llvm/IR/Constants.h14
-rw-r--r--include/llvm/IR/DerivedUser.h41
-rw-r--r--include/llvm/IR/Function.h2
-rw-r--r--include/llvm/IR/GlobalValue.h8
-rw-r--r--include/llvm/IR/GlobalVariable.h2
-rw-r--r--include/llvm/IR/InlineAsm.h3
-rw-r--r--include/llvm/IR/InstrTypes.h10
-rw-r--r--include/llvm/IR/Instruction.def10
-rw-r--r--include/llvm/IR/Instruction.h14
-rw-r--r--include/llvm/IR/Instructions.h12
-rw-r--r--include/llvm/IR/Metadata.h3
-rw-r--r--include/llvm/IR/OperandTraits.h6
-rw-r--r--include/llvm/IR/Operator.h7
-rw-r--r--include/llvm/IR/PatternMatch.h16
-rw-r--r--include/llvm/IR/User.h6
-rw-r--r--include/llvm/IR/Value.def17
-rw-r--r--include/llvm/IR/Value.h21
-rw-r--r--include/llvm/InitializePasses.h1
-rw-r--r--include/llvm/Object/Binary.h4
-rw-r--r--include/llvm/Object/COFFImportFile.h34
-rw-r--r--include/llvm/Object/COFFModuleDefinition.h49
-rw-r--r--include/llvm/Object/Decompressor.h5
-rw-r--r--include/llvm/Object/ELF.h5
-rw-r--r--include/llvm/Object/RelocVisitor.h231
-rw-r--r--include/llvm/Object/WindowsResource.h82
-rw-r--r--include/llvm/PassInfo.h21
-rw-r--r--include/llvm/PassSupport.h6
-rw-r--r--include/llvm/Support/BinaryStreamReader.h21
-rw-r--r--include/llvm/Support/BinaryStreamRef.h150
-rw-r--r--include/llvm/Support/BinaryStreamWriter.h15
-rw-r--r--include/llvm/Support/FileSystem.h2
-rw-r--r--include/llvm/Target/GlobalISel/SelectionDAGCompat.td1
-rw-r--r--include/llvm/Transforms/IPO/FunctionImport.h3
-rw-r--r--include/llvm/Transforms/Scalar/GVNExpression.h16
-rw-r--r--include/llvm/Transforms/Utils/SimplifyLibCalls.h4
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp5
-rw-r--r--lib/Analysis/BranchProbabilityInfo.cpp49
-rw-r--r--lib/Analysis/CallGraphSCCPass.cpp6
-rw-r--r--lib/Analysis/InstructionSimplify.cpp74
-rw-r--r--lib/Analysis/MemorySSA.cpp23
-rw-r--r--lib/Analysis/ScalarEvolution.cpp51
-rw-r--r--lib/Analysis/TargetLibraryInfo.cpp16
-rw-r--r--lib/Analysis/ValueTracking.cpp122
-rw-r--r--lib/AsmParser/LLParser.cpp8
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp4
-rw-r--r--lib/Bitcode/Reader/ValueList.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.cpp41
-rw-r--r--lib/CodeGen/AtomicExpandPass.cpp22
-rw-r--r--lib/CodeGen/CMakeLists.txt1
-rw-r--r--lib/CodeGen/CodeGen.cpp1
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp27
-rw-r--r--lib/CodeGen/DwarfEHPrepare.cpp32
-rw-r--r--lib/CodeGen/GlobalISel/IRTranslator.cpp26
-rw-r--r--lib/CodeGen/GlobalISel/InstructionSelector.cpp2
-rw-r--r--lib/CodeGen/InterleavedAccessPass.cpp19
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp20
-rw-r--r--lib/CodeGen/LiveRangeShrink.cpp211
-rw-r--r--lib/CodeGen/LowerEmuTLS.cpp22
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp45
-rw-r--r--lib/CodeGen/MachineModuleInfo.cpp4
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp42
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp8
-rw-r--r--lib/CodeGen/SafeStack.cpp22
-rw-r--r--lib/CodeGen/SafeStackColoring.cpp3
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp436
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp49
-rw-r--r--lib/CodeGen/SelectionDAG/StatepointLowering.cpp4
-rw-r--r--lib/CodeGen/StackProtector.cpp13
-rw-r--r--lib/CodeGen/TargetPassConfig.cpp21
-rw-r--r--lib/CodeGen/WinEHPrepare.cpp10
-rw-r--r--lib/DebugInfo/CodeView/CMakeLists.txt7
-rw-r--r--lib/DebugInfo/CodeView/CVTypeDumper.cpp61
-rw-r--r--lib/DebugInfo/CodeView/CVTypeVisitor.cpp188
-rw-r--r--lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp229
-rw-r--r--lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp89
-rw-r--r--lib/DebugInfo/CodeView/SymbolDumper.cpp13
-rw-r--r--lib/DebugInfo/CodeView/TypeDatabase.cpp75
-rw-r--r--lib/DebugInfo/CodeView/TypeDumpVisitor.cpp13
-rw-r--r--lib/DebugInfo/CodeView/TypeIndex.cpp27
-rw-r--r--lib/DebugInfo/CodeView/TypeSerializer.cpp33
-rw-r--r--lib/DebugInfo/CodeView/TypeStreamMerger.cpp14
-rw-r--r--lib/DebugInfo/CodeView/TypeTableCollection.cpp83
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp34
-rw-r--r--lib/DebugInfo/DWARF/DWARFUnit.cpp31
-rw-r--r--lib/DebugInfo/PDB/Native/DbiStream.cpp8
-rw-r--r--lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp1
-rw-r--r--lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp1
-rw-r--r--lib/DebugInfo/PDB/Native/TpiStream.cpp6
-rw-r--r--lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp2
-rw-r--r--lib/Demangle/ItaniumDemangle.cpp2
-rw-r--r--lib/IR/Attributes.cpp2
-rw-r--r--lib/IR/AutoUpgrade.cpp1
-rw-r--r--lib/IR/Constants.cpp15
-rw-r--r--lib/IR/ConstantsContext.h20
-rw-r--r--lib/IR/DataLayout.cpp2
-rw-r--r--lib/IR/Function.cpp2
-rw-r--r--lib/IR/IRBuilder.cpp5
-rw-r--r--lib/IR/InlineAsm.cpp4
-rw-r--r--lib/IR/Instruction.cpp2
-rw-r--r--lib/IR/Instructions.cpp27
-rw-r--r--lib/IR/LLVMContextImpl.cpp21
-rw-r--r--lib/IR/PassRegistry.cpp2
-rw-r--r--lib/IR/User.cpp10
-rw-r--r--lib/IR/Value.cpp29
-rw-r--r--lib/IR/ValueTypes.cpp2
-rw-r--r--lib/IR/Verifier.cpp6
-rw-r--r--lib/LTO/ThinLTOCodeGenerator.cpp72
-rw-r--r--lib/Linker/IRMover.cpp26
-rw-r--r--lib/Object/Binary.cpp4
-rw-r--r--lib/Object/CMakeLists.txt3
-rw-r--r--lib/Object/COFFImportFile.cpp527
-rw-r--r--lib/Object/COFFModuleDefinition.cpp319
-rw-r--r--lib/Object/Decompressor.cpp5
-rw-r--r--lib/Object/WindowsResource.cpp90
-rw-r--r--lib/Passes/PassBuilder.cpp9
-rw-r--r--lib/Support/APInt.cpp253
-rw-r--r--lib/Support/BinaryStreamReader.cpp18
-rw-r--r--lib/Support/BinaryStreamRef.cpp137
-rw-r--r--lib/Support/BinaryStreamWriter.cpp11
-rw-r--r--lib/Support/CMakeLists.txt1
-rw-r--r--lib/Support/FormattedStream.cpp1
-rw-r--r--lib/Support/Triple.cpp18
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td10
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp5
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h6
-rw-r--r--lib/Target/AArch64/AArch64TargetMachine.cpp11
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h20
-rw-r--r--lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp15
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallLowering.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUCallingConv.td50
-rw-r--r--lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp24
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp120
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp163
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h5
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td2
-rw-r--r--lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp21
-rw-r--r--lib/Target/AMDGPU/AMDGPUMCInstLower.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp17
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp31
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.cpp45
-rw-r--r--lib/Target/AMDGPU/AMDGPURegisterInfo.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp20
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp25
-rw-r--r--lib/Target/AMDGPU/BUFInstructions.td4
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp14
-rw-r--r--lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h2
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp146
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.h2
-rw-r--r--lib/Target/AMDGPU/R600ClauseMergePass.cpp6
-rw-r--r--lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp6
-rw-r--r--lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp6
-rw-r--r--lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp6
-rw-r--r--lib/Target/AMDGPU/R600Packetizer.cpp6
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.cpp12
-rw-r--r--lib/Target/AMDGPU/R600RegisterInfo.h2
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp20
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.h2
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp174
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h11
-rw-r--r--lib/Target/AMDGPU/SIInstrFormats.td16
-rw-r--r--lib/Target/AMDGPU/SILoadStoreOptimizer.cpp8
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp19
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.h5
-rw-r--r--lib/Target/AMDGPU/SIPeepholeSDWA.cpp45
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp120
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.h14
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td18
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp13
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h1
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp2
-rw-r--r--lib/Target/Mips/Mips.h12
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp17
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp12
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp87
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp16
-rw-r--r--lib/Target/Mips/MipsModuleISelDAGToDAG.cpp18
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp6
-rw-r--r--lib/Target/Mips/MipsOs16.cpp4
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp24
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp15
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td8
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp50
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp2
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp2
-rwxr-xr-xlib/Target/Sparc/LeonPasses.cpp17
-rwxr-xr-xlib/Target/Sparc/LeonPasses.h11
-rw-r--r--lib/Target/Sparc/Sparc.h2
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp14
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp2
-rw-r--r--lib/Target/X86/X86.td3
-rw-r--r--lib/Target/X86/X86CallingConv.td42
-rw-r--r--lib/Target/X86/X86FastISel.cpp7
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp269
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp162
-rw-r--r--lib/Target/X86/X86InstrAVX512.td132
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td6
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp4
-rw-r--r--lib/Target/X86/X86InstrInfo.td13
-rw-r--r--lib/Target/X86/X86InstrTSX.td5
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp24
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp19
-rw-r--r--lib/Target/X86/X86RegisterInfo.td4
-rw-r--r--lib/Target/X86/X86Subtarget.cpp7
-rw-r--r--lib/Target/X86/X86Subtarget.h15
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp15
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp36
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp2
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp10
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp30
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp79
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp19
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp128
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h54
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp11
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp2
-rw-r--r--lib/Transforms/Scalar/GVN.cpp2
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp33
-rw-r--r--lib/Transforms/Scalar/LoadCombine.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp71
-rw-r--r--lib/Transforms/Scalar/LoopPredication.cpp86
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp1
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp815
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp2
-rw-r--r--lib/Transforms/Scalar/SROA.cpp8
-rw-r--r--lib/Transforms/Scalar/StraightLineStrengthReduce.cpp2
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp2
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp6
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp82
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp2
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp4
-rw-r--r--test/Analysis/BranchProbabilityInfo/basic.ll42
-rw-r--r--test/Analysis/BranchProbabilityInfo/deopt-intrinsic.ll4
-rw-r--r--test/Analysis/BranchProbabilityInfo/noreturn.ll22
-rw-r--r--test/Analysis/CostModel/X86/ctlz.ll156
-rw-r--r--test/Analysis/CostModel/X86/ctpop.ll63
-rw-r--r--test/Analysis/CostModel/X86/cttz.ll131
-rw-r--r--test/Analysis/ScalarEvolution/nsw.ll2
-rw-r--r--test/Analysis/ScalarEvolution/trip-count-pow2.ll10
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-misched-multimmo.ll2
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll5
-rw-r--r--test/CodeGen/AMDGPU/add.i16.ll3
-rw-r--r--test/CodeGen/AMDGPU/add.v2i16.ll8
-rw-r--r--test/CodeGen/AMDGPU/bfe-patterns.ll4
-rw-r--r--test/CodeGen/AMDGPU/coalescer_distribute.ll4
-rw-r--r--test/CodeGen/AMDGPU/ctlz.ll3
-rw-r--r--test/CodeGen/AMDGPU/ctlz_zero_undef.ll3
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll4
-rw-r--r--test/CodeGen/AMDGPU/endcf-loop-header.ll8
-rw-r--r--test/CodeGen/AMDGPU/fmed3.ll8
-rw-r--r--test/CodeGen/AMDGPU/frame-index-elimination.ll124
-rw-r--r--test/CodeGen/AMDGPU/function-args.ll734
-rw-r--r--test/CodeGen/AMDGPU/function-returns.ll514
-rw-r--r--test/CodeGen/AMDGPU/hsa-func.ll13
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-phi.ll6
-rw-r--r--test/CodeGen/AMDGPU/inline-asm.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll2
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll6
-rw-r--r--test/CodeGen/AMDGPU/lshl64-to-32.ll45
-rw-r--r--test/CodeGen/AMDGPU/packed-op-sel.ll431
-rw-r--r--test/CodeGen/AMDGPU/regcoalesce-prune.mir31
-rw-r--r--test/CodeGen/AMDGPU/sdwa-peephole.ll50
-rw-r--r--test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll11
-rw-r--r--test/CodeGen/AMDGPU/srl.ll3
-rw-r--r--test/CodeGen/AMDGPU/sub.i16.ll2
-rw-r--r--test/CodeGen/AMDGPU/subreg_interference.mir8
-rw-r--r--test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll4
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll30
-rw-r--r--test/CodeGen/ARM/ifcvt-branch-weight-bug.ll2
-rw-r--r--test/CodeGen/Generic/opt-codegen-no-target-machine.ll3
-rw-r--r--test/CodeGen/Mips/dins.ll70
-rw-r--r--test/CodeGen/Mips/micromips-attr.ll39
-rw-r--r--test/CodeGen/Mips/mips64-f128.ll9
-rw-r--r--test/CodeGen/NVPTX/sched1.ll4
-rw-r--r--test/CodeGen/NVPTX/sched2.ll4
-rw-r--r--test/CodeGen/NVPTX/vec8.ll2
-rw-r--r--test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll33
-rw-r--r--test/CodeGen/PowerPC/shift128.ll92
-rw-r--r--test/CodeGen/SPARC/LeonItinerariesUT.ll4
-rw-r--r--test/CodeGen/X86/2007-01-08-InstrSched.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/add-vec.ll111
-rw-r--r--test/CodeGen/X86/GlobalISel/binop.ll8
-rw-r--r--test/CodeGen/X86/GlobalISel/br.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/callingconv.ll8
-rw-r--r--test/CodeGen/X86/GlobalISel/cmp.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/constant.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/ext-x86-64.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/ext.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/frameIndex.ll12
-rw-r--r--test/CodeGen/X86/GlobalISel/gep.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v128.mir119
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v256.mir157
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v512.mir139
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir119
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir120
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir120
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-vec.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/mul-scalar.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/mul-vec.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir55
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir54
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v128.mir195
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v256.mir185
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v512.mir130
-rw-r--r--test/CodeGen/X86/GlobalISel/select-br.mir4
-rw-r--r--test/CodeGen/X86/GlobalISel/select-cmp.mir2
-rw-r--r--test/CodeGen/X86/GlobalISel/select-constant.mir2
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir2
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir4
-rw-r--r--test/CodeGen/X86/GlobalISel/select-frameIndex.mir6
-rw-r--r--test/CodeGen/X86/GlobalISel/select-gep.mir2
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v128.mir195
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v256.mir185
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v512.mir130
-rw-r--r--test/CodeGen/X86/GlobalISel/select-trunc.mir92
-rw-r--r--test/CodeGen/X86/GlobalISel/sub-vec.ll111
-rw-r--r--test/CodeGen/X86/GlobalISel/trunc.ll2
-rw-r--r--test/CodeGen/X86/O0-pipeline.ll2
-rw-r--r--test/CodeGen/X86/addcarry.ll20
-rw-r--r--test/CodeGen/X86/avg.ll833
-rw-r--r--test/CodeGen/X86/avx-intrinsics-fast-isel.ll52
-rw-r--r--test/CodeGen/X86/avx.ll2
-rw-r--r--test/CodeGen/X86/avx512-cmp-kor-sequence.ll6
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll81
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll4
-rw-r--r--test/CodeGen/X86/avx512-ext.ll24
-rw-r--r--test/CodeGen/X86/avx512-fsel.ll24
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll10
-rwxr-xr-xtest/CodeGen/X86/avx512-i1test.ll5
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll220
-rw-r--r--test/CodeGen/X86/avx512-insert-extract_i1.ll5
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll116
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll422
-rw-r--r--test/CodeGen/X86/avx512-load-store.ll8
-rwxr-xr-xtest/CodeGen/X86/avx512-mask-bugfix.ll57
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll151
-rw-r--r--test/CodeGen/X86/avx512-mask-spills.ll40
-rw-r--r--test/CodeGen/X86/avx512-memfold.ll5
-rw-r--r--test/CodeGen/X86/avx512-regcall-NoMask.ll32
-rw-r--r--test/CodeGen/X86/avx512-scalar_mask.ll14
-rw-r--r--test/CodeGen/X86/avx512-select.ll4
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll12
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll16
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll24
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll2
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll9
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll10
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/avx512er-intrinsics.ll6
-rw-r--r--test/CodeGen/X86/avx512ifma-intrinsics.ll8
-rw-r--r--test/CodeGen/X86/avx512ifmavl-intrinsics.ll16
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll64
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll28
-rw-r--r--test/CodeGen/X86/bitcast-setcc-128.ll823
-rw-r--r--test/CodeGen/X86/bitcast-setcc-256.ll363
-rw-r--r--test/CodeGen/X86/bswap_tree2.ll35
-rw-r--r--test/CodeGen/X86/constant-combines.ll16
-rw-r--r--test/CodeGen/X86/fast-isel-load-i1.ll4
-rw-r--r--test/CodeGen/X86/fma-fneg-combine.ll5
-rw-r--r--test/CodeGen/X86/fmsubadd-combine.ll193
-rw-r--r--test/CodeGen/X86/fold-tied-op.ll7
-rw-r--r--test/CodeGen/X86/fp128-i128.ll2
-rw-r--r--test/CodeGen/X86/haddsub-2.ll12
-rw-r--r--test/CodeGen/X86/leaFixup32.mir509
-rw-r--r--test/CodeGen/X86/leaFixup64.mir1041
-rw-r--r--test/CodeGen/X86/lrshrink.ll57
-rw-r--r--test/CodeGen/X86/madd.ll34
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll34
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll16
-rw-r--r--test/CodeGen/X86/misched-matrix.ll4
-rw-r--r--test/CodeGen/X86/mul-i1024.ll3827
-rw-r--r--test/CodeGen/X86/mul-i256.ll94
-rw-r--r--test/CodeGen/X86/mul-i512.ll705
-rw-r--r--test/CodeGen/X86/oddshuffles.ll34
-rw-r--r--test/CodeGen/X86/overflow.ll8
-rw-r--r--test/CodeGen/X86/pmul.ll55
-rw-r--r--test/CodeGen/X86/pr27591.ll18
-rw-r--r--test/CodeGen/X86/pr28173.ll20
-rw-r--r--test/CodeGen/X86/pr29112.ll8
-rw-r--r--test/CodeGen/X86/pr31088.ll2
-rw-r--r--test/CodeGen/X86/pr32241.ll68
-rw-r--r--test/CodeGen/X86/pr32256.ll36
-rw-r--r--test/CodeGen/X86/pr32284.ll12
-rw-r--r--test/CodeGen/X86/pr32451.ll6
-rw-r--r--test/CodeGen/X86/rotate.ll16
-rw-r--r--test/CodeGen/X86/rtm.ll10
-rw-r--r--test/CodeGen/X86/sad.ll929
-rw-r--r--test/CodeGen/X86/select.ll28
-rw-r--r--test/CodeGen/X86/setcc-wide-types.ll56
-rw-r--r--test/CodeGen/X86/shrink_vmul_sse.ll2
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel.ll10
-rw-r--r--test/CodeGen/X86/sse-scalar-fp-arith.ll8
-rw-r--r--test/CodeGen/X86/sse1.ll8
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll14
-rw-r--r--test/CodeGen/X86/sse41.ll8
-rw-r--r--test/CodeGen/X86/subcarry.ll137
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll84
-rw-r--r--test/CodeGen/X86/vector-bitreverse.ll6
-rw-r--r--test/CodeGen/X86/vector-blend.ll4
-rw-r--r--test/CodeGen/X86/vector-sqrt.ll8
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll14
-rw-r--r--test/CodeGen/X86/xmulo.ll8
-rw-r--r--test/CodeGen/X86/xor-select-i1-combine.ll6
-rw-r--r--test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.dwobin0 -> 1056 bytes
-rw-r--r--test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.obin0 -> 2968 bytes
-rw-r--r--test/DebugInfo/Inputs/split-dwarf-multiple-cu.dwobin0 -> 1040 bytes
-rw-r--r--test/DebugInfo/Inputs/split-dwarf-multiple-cu.obin0 -> 2992 bytes
-rw-r--r--test/DebugInfo/PDB/Inputs/merge1.yaml52
-rw-r--r--test/DebugInfo/PDB/Inputs/merge2.yaml52
-rw-r--r--test/DebugInfo/PDB/pdbdump-headers.test4
-rw-r--r--test/DebugInfo/PDB/pdbdump-mergetypes.test24
-rw-r--r--test/DebugInfo/llvm-symbolizer.test14
-rw-r--r--test/Instrumentation/MemorySanitizer/csr.ll1
-rw-r--r--test/LTO/X86/Inputs/strip-debug-info-bar.ll15
-rw-r--r--test/LTO/X86/Inputs/strip-debug-info.bcbin852 -> 0 bytes
-rw-r--r--test/LTO/X86/strip-debug-info.ll49
-rw-r--r--test/MC/AMDGPU/exp.s12
-rw-r--r--test/MC/Disassembler/AMDGPU/exp_vi.txt40
-rw-r--r--test/MC/Disassembler/AMDGPU/sopc_vi.txt3
-rw-r--r--test/TableGen/GlobalISelEmitter.td35
-rw-r--r--test/TableGen/intrinsic-varargs.td2
-rw-r--r--test/ThinLTO/X86/Inputs/merge-triple.ll1
-rw-r--r--test/ThinLTO/X86/merge-triple.ll10
-rw-r--r--test/Transforms/InstCombine/2008-09-29-FoldingOr.ll10
-rw-r--r--test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll (renamed from test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll)0
-rw-r--r--test/Transforms/InstCombine/NVPTX/lit.local.cfg2
-rw-r--r--test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll (renamed from test/Transforms/InstCombine/nvvm-intrins.ll)0
-rw-r--r--test/Transforms/InstCombine/cast.ll52
-rw-r--r--test/Transforms/InstCombine/lshr.ll72
-rw-r--r--test/Transforms/InstCombine/memchr.ll9
-rw-r--r--test/Transforms/InstCombine/set.ll94
-rw-r--r--test/Transforms/InstCombine/wcslen-1.ll191
-rw-r--r--test/Transforms/InstCombine/wcslen-2.ll18
-rw-r--r--test/Transforms/InstCombine/wcslen-3.ll197
-rw-r--r--test/Transforms/InstSimplify/AndOrXor.ll76
-rw-r--r--test/Transforms/InstSimplify/icmp-bool-constant.ll171
-rw-r--r--test/Transforms/JumpThreading/assume.ll44
-rw-r--r--test/Transforms/JumpThreading/fold-not-thread.ll4
-rw-r--r--test/Transforms/JumpThreading/guards.ll94
-rw-r--r--test/Transforms/LoopStrengthReduce/X86/canonical-2.ll36
-rw-r--r--test/Transforms/NewGVN/completeness.ll415
-rw-r--r--test/Transforms/NewGVN/pr32838.ll157
-rw-r--r--test/Transforms/NewGVN/pr32845.ll64
-rw-r--r--test/Transforms/NewGVN/pr32897.ll26
-rw-r--r--test/Transforms/NewGVN/pr32945.ll24
-rw-r--r--test/Transforms/NewGVN/pr33014.ll54
-rw-r--r--test/Transforms/NewGVN/pr33086.ll59
-rw-r--r--test/Transforms/NewGVN/pr33116.ll39
-rw-r--r--test/Transforms/NewGVN/storeoverstore.ll20
-rw-r--r--test/Transforms/SafeStack/X86/coloring-ssp.ll2
-rw-r--r--test/Transforms/SafeStack/X86/coloring.ll4
-rw-r--r--test/Transforms/SafeStack/X86/coloring2.ll4
-rw-r--r--test/Transforms/SafeStack/X86/layout-frag.ll2
-rw-r--r--test/tools/llvm-cvtres/Inputs/cursor_small.bmpbin0 -> 822 bytes
-rw-r--r--test/tools/llvm-cvtres/Inputs/okay_small.bmpbin0 -> 822 bytes
-rw-r--r--test/tools/llvm-cvtres/Inputs/test_resource.rc44
-rw-r--r--test/tools/llvm-cvtres/Inputs/test_resource.resbin0 -> 2200 bytes
-rw-r--r--test/tools/llvm-cvtres/resource.test7
-rw-r--r--tools/llc/llc.cpp4
-rw-r--r--tools/llvm-cvtres/CMakeLists.txt1
-rw-r--r--tools/llvm-cvtres/llvm-cvtres.cpp102
-rw-r--r--tools/llvm-cvtres/llvm-cvtres.h6
-rw-r--r--tools/llvm-dwp/llvm-dwp.cpp2
-rw-r--r--tools/llvm-lto/llvm-lto.cpp24
-rw-r--r--tools/llvm-pdbdump/Analyze.cpp1
-rw-r--r--tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp10
-rw-r--r--tools/llvm-pdbdump/CompactTypeDumpVisitor.h8
-rw-r--r--tools/llvm-pdbdump/LLVMOutputStyle.cpp119
-rw-r--r--tools/llvm-pdbdump/LLVMOutputStyle.h12
-rw-r--r--tools/llvm-pdbdump/PdbYaml.cpp2
-rw-r--r--tools/llvm-pdbdump/YAMLOutputStyle.h1
-rw-r--r--tools/llvm-pdbdump/YamlTypeDumper.cpp1
-rw-r--r--tools/llvm-pdbdump/llvm-pdbdump.cpp86
-rw-r--r--tools/llvm-readobj/COFFDumper.cpp99
-rw-r--r--tools/opt/opt.cpp12
-rw-r--r--unittests/ADT/APIntTest.cpp58
-rw-r--r--unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp83
-rw-r--r--unittests/IR/AttributesTest.cpp19
-rw-r--r--unittests/IR/ConstantsTest.cpp17
-rw-r--r--unittests/IR/DominatorTreeTest.cpp4
-rw-r--r--unittests/IR/InstructionsTest.cpp4
-rw-r--r--unittests/IR/MetadataTest.cpp2
-rw-r--r--unittests/Support/BinaryStreamTest.cpp33
-rw-r--r--unittests/Support/CrashRecoveryTest.cpp6
-rw-r--r--unittests/Support/ScaledNumberTest.cpp6
-rw-r--r--unittests/Target/AArch64/InstSizes.cpp3
-rw-r--r--unittests/Transforms/Utils/Cloning.cpp11
-rw-r--r--utils/TableGen/CodeGenTarget.cpp2
-rw-r--r--utils/TableGen/GlobalISelEmitter.cpp25
-rwxr-xr-xutils/git-svn/git-llvm2
-rw-r--r--utils/lit/lit/TestRunner.py8
539 files changed, 21334 insertions, 8359 deletions
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 0a32d3957a53..2deae9361874 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -186,8 +186,8 @@ CMake manual, or execute ``cmake --help-variable VARIABLE_NAME``.
Sets the build type for ``make``-based generators. Possible values are
Release, Debug, RelWithDebInfo and MinSizeRel. If you are using an IDE such as
Visual Studio, you should use the IDE settings to set the build type.
- Be aware that Release and RelWithDebInfo are not using the same optimization
- level on most platform.
+ Be aware that Release and RelWithDebInfo use different optimization levels on
+ most platforms.
**CMAKE_INSTALL_PREFIX**:PATH
Path where LLVM will be installed if "make install" is invoked or the
diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst
index f39f1d9207a2..d13479dabca8 100644
--- a/docs/GetElementPtr.rst
+++ b/docs/GetElementPtr.rst
@@ -9,10 +9,11 @@ Introduction
============
This document seeks to dispel the mystery and confusion surrounding LLVM's
-`GetElementPtr <LangRef.html#i_getelementptr>`_ (GEP) instruction. Questions
-about the wily GEP instruction are probably the most frequently occurring
-questions once a developer gets down to coding with LLVM. Here we lay out the
-sources of confusion and show that the GEP instruction is really quite simple.
+`GetElementPtr <LangRef.html#getelementptr-instruction>`_ (GEP) instruction.
+Questions about the wily GEP instruction are probably the most frequently
+occurring questions once a developer gets down to coding with LLVM. Here we lay
+out the sources of confusion and show that the GEP instruction is really quite
+simple.
Address Computation
===================
@@ -429,7 +430,8 @@ because LLVM has no restrictions on mixing types in addressing, loads or stores.
LLVM's type-based alias analysis pass uses metadata to describe a different type
system (such as the C type system), and performs type-based aliasing on top of
-that. Further details are in the `language reference <LangRef.html#tbaa>`_.
+that. Further details are in the
+`language reference <LangRef.html#tbaa-metadata>`_.
What happens if a GEP computation overflows?
--------------------------------------------
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 9ff47e8366dc..b205cae9b118 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -161,7 +161,7 @@ symbol table entries. Here is an example of the "hello world" module:
; Definition of main function
define i32 @main() { ; i32()*
- ; Convert [13 x i8]* to i8 *...
+ ; Convert [13 x i8]* to i8*...
%cast210 = getelementptr [13 x i8], [13 x i8]* @.str, i64 0, i64 0
; Call puts function to write out the string to stdout.
@@ -9548,7 +9548,7 @@ Syntax:
::
- declare i8 *@llvm.returnaddress(i32 <level>)
+ declare i8* @llvm.returnaddress(i32 <level>)
Overview:
"""""""""
@@ -9586,7 +9586,7 @@ Syntax:
::
- declare i8 *@llvm.addressofreturnaddress()
+ declare i8* @llvm.addressofreturnaddress()
Overview:
"""""""""
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 894e5571f8ad..fe75e25bd8d2 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -182,8 +182,9 @@ private:
/// provides a more convenient form of divide for internal use since KnuthDiv
/// has specific constraints on its inputs. If those constraints are not met
/// then it provides a simpler form of divide.
- static void divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
- unsigned rhsWords, APInt *Quotient, APInt *Remainder);
+ static void divide(const WordType *LHS, unsigned lhsWords,
+ const WordType *RHS, unsigned rhsWords, WordType *Quotient,
+ WordType *Remainder);
/// out-of-line slow case for inline constructor
void initSlowCase(uint64_t val, bool isSigned);
@@ -1016,11 +1017,13 @@ public:
///
/// \returns a new APInt value containing the division result
APInt udiv(const APInt &RHS) const;
+ APInt udiv(uint64_t RHS) const;
/// \brief Signed division function for APInt.
///
/// Signed divide this APInt by APInt RHS.
APInt sdiv(const APInt &RHS) const;
+ APInt sdiv(int64_t RHS) const;
/// \brief Unsigned remainder operation.
///
@@ -1032,11 +1035,13 @@ public:
///
/// \returns a new APInt value containing the remainder result
APInt urem(const APInt &RHS) const;
+ uint64_t urem(uint64_t RHS) const;
/// \brief Function for signed remainder operation.
///
/// Signed remainder operation on APInt.
APInt srem(const APInt &RHS) const;
+ int64_t srem(int64_t RHS) const;
/// \brief Dual division/remainder interface.
///
@@ -1047,9 +1052,13 @@ public:
/// udivrem(X, Y, X, Y), for example.
static void udivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient,
APInt &Remainder);
+ static void udivrem(const APInt &LHS, uint64_t RHS, APInt &Quotient,
+ uint64_t &Remainder);
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient,
APInt &Remainder);
+ static void sdivrem(const APInt &LHS, int64_t RHS, APInt &Quotient,
+ int64_t &Remainder);
// Operations that return overflow indicators.
APInt sadd_ov(const APInt &RHS, bool &Overflow) const;
@@ -2015,7 +2024,7 @@ inline APInt operator-(APInt a, const APInt &b) {
}
inline APInt operator-(const APInt &a, APInt &&b) {
- b = -std::move(b);
+ b.negate();
b += a;
return std::move(b);
}
@@ -2026,7 +2035,7 @@ inline APInt operator-(APInt a, uint64_t RHS) {
}
inline APInt operator-(uint64_t LHS, APInt b) {
- b = -std::move(b);
+ b.negate();
b += LHS;
return b;
}
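
A brief usage sketch of the new 64-bit overloads declared above, assuming they mirror the semantics of their APInt/APInt counterparts (the helper name is hypothetical):

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    // Divide a wide value by a small constant without materializing the
    // divisor as a temporary APInt.
    static void splitByTen(const APInt &X) {
      APInt Quotient(X.getBitWidth(), 0);
      uint64_t Remainder = 0;
      APInt::udivrem(X, 10, Quotient, Remainder); // one pass for both results
      APInt Q = X.udiv(10);                       // quotient only
      uint64_t R = X.urem(10);                    // remainder as plain uint64_t
      (void)Q; (void)R;
    }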
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index b49d216e0b6e..a0b380b237da 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -365,6 +365,8 @@ protected:
public:
using iterator = SmallPtrSetIterator<PtrType>;
using const_iterator = SmallPtrSetIterator<PtrType>;
+ using key_type = ConstPtrType;
+ using value_type = PtrType;
SmallPtrSetImpl(const SmallPtrSetImpl &) = delete;
diff --git a/include/llvm/ADT/Statistic.h b/include/llvm/ADT/Statistic.h
index 53fa2a50fcba..d5ebba409c3d 100644
--- a/include/llvm/ADT/Statistic.h
+++ b/include/llvm/ADT/Statistic.h
@@ -101,6 +101,16 @@ public:
return init();
}
+ void updateMax(unsigned V) {
+ unsigned PrevMax = Value.load(std::memory_order_relaxed);
+ // Keep trying to update max until we succeed or another thread produces
+ // a bigger max than us.
+ while (V > PrevMax && !Value.compare_exchange_weak(
+ PrevMax, V, std::memory_order_relaxed)) {
+ }
+ init();
+ }
+
#else // Statistics are disabled in release builds.
const Statistic &operator=(unsigned Val) {
@@ -131,6 +141,8 @@ public:
return *this;
}
+ void updateMax(unsigned V) {}
+
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
protected:
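
The updateMax loop above retries compare_exchange_weak, which reloads PrevMax on failure, so it terminates either when the store succeeds or when another thread has already published a value greater than or equal to V. A minimal usage sketch, with a hypothetical pass name and statistic:

    #define DEBUG_TYPE "my-pass" // hypothetical; required by STATISTIC()
    #include "llvm/ADT/Statistic.h"
    using namespace llvm;

    STATISTIC(MaxOperandCount, "Maximum operand count seen on one instruction");

    // Safe to call from concurrent workers; the statistic only ever grows.
    static void noteOperandCount(unsigned N) {
      MaxOperandCount.updateMax(N);
    }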
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index e3a8a31ba9bc..3a4a37017d61 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -252,6 +252,10 @@ public:
ObjectFormat == Other.ObjectFormat;
}
+ bool operator!=(const Triple &Other) const {
+ return !(*this == Other);
+ }
+
/// @}
/// @name Normalization
/// @{
@@ -722,6 +726,12 @@ public:
/// \returns true if the triple is little endian, false otherwise.
bool isLittleEndian() const;
+ /// Test whether target triples are compatible.
+ bool isCompatibleWith(const Triple &Other) const;
+
+ /// Merge target triples.
+ std::string merge(const Triple &Other) const;
+
/// @}
/// @name Static helpers for IDs.
/// @{
diff --git a/include/llvm/Analysis/MemorySSA.h b/include/llvm/Analysis/MemorySSA.h
index db31ae9f4f10..f0bba8c4c020 100644
--- a/include/llvm/Analysis/MemorySSA.h
+++ b/include/llvm/Analysis/MemorySSA.h
@@ -84,6 +84,7 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedUser.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
@@ -127,7 +128,7 @@ using const_memoryaccess_def_iterator =
// \brief The base for all memory accesses. All memory accesses in a block are
// linked together using an intrusive list.
class MemoryAccess
- : public User,
+ : public DerivedUser,
public ilist_node<MemoryAccess, ilist_tag<MSSAHelpers::AllAccessTag>>,
public ilist_node<MemoryAccess, ilist_tag<MSSAHelpers::DefsOnlyTag>> {
public:
@@ -145,15 +146,14 @@ public:
MemoryAccess(const MemoryAccess &) = delete;
MemoryAccess &operator=(const MemoryAccess &) = delete;
- ~MemoryAccess() override;
void *operator new(size_t, unsigned) = delete;
void *operator new(size_t) = delete;
BasicBlock *getBlock() const { return Block; }
- virtual void print(raw_ostream &OS) const = 0;
- virtual void dump() const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
/// \brief The user iterators for a memory access
typedef user_iterator iterator;
@@ -207,11 +207,12 @@ protected:
/// \brief Used for debugging and tracking things about MemoryAccesses.
/// Guaranteed unique among MemoryAccesses, no guarantees otherwise.
- virtual unsigned getID() const = 0;
+ inline unsigned getID() const;
- MemoryAccess(LLVMContext &C, unsigned Vty, BasicBlock *BB,
- unsigned NumOperands)
- : User(Type::getVoidTy(C), Vty, nullptr, NumOperands), Block(BB) {}
+ MemoryAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue,
+ BasicBlock *BB, unsigned NumOperands)
+ : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue),
+ Block(BB) {}
private:
BasicBlock *Block;
@@ -248,21 +249,21 @@ public:
// Sadly, these have to be public because they are needed in some of the
// iterators.
- virtual bool isOptimized() const = 0;
- virtual MemoryAccess *getOptimized() const = 0;
- virtual void setOptimized(MemoryAccess *) = 0;
+ inline bool isOptimized() const;
+ inline MemoryAccess *getOptimized() const;
+ inline void setOptimized(MemoryAccess *);
/// \brief Reset the ID of what this MemoryUse was optimized to, causing it to
/// be rewalked by the walker if necessary.
/// This really should only be called by tests.
- virtual void resetOptimized() = 0;
+ inline void resetOptimized();
protected:
friend class MemorySSA;
friend class MemorySSAUpdater;
MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty,
- Instruction *MI, BasicBlock *BB)
- : MemoryAccess(C, Vty, BB, 1), MemoryInst(MI) {
+ DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB)
+ : MemoryAccess(C, Vty, DeleteValue, BB, 1), MemoryInst(MI) {
setDefiningAccess(DMA);
}
void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false) {
@@ -292,7 +293,8 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryUse(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB)
- : MemoryUseOrDef(C, DMA, MemoryUseVal, MI, BB), OptimizedID(0) {}
+ : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB),
+ OptimizedID(0) {}
// allocate space for exactly one operand
void *operator new(size_t s) { return User::operator new(s, 1); }
@@ -302,32 +304,30 @@ public:
return MA->getValueID() == MemoryUseVal;
}
- void print(raw_ostream &OS) const override;
+ void print(raw_ostream &OS) const;
- virtual void setOptimized(MemoryAccess *DMA) override {
+ void setOptimized(MemoryAccess *DMA) {
OptimizedID = DMA->getID();
setOperand(0, DMA);
}
- virtual bool isOptimized() const override {
+ bool isOptimized() const {
return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID();
}
- virtual MemoryAccess *getOptimized() const override {
+ MemoryAccess *getOptimized() const {
return getDefiningAccess();
}
- virtual void resetOptimized() override {
+ void resetOptimized() {
OptimizedID = INVALID_MEMORYACCESS_ID;
}
protected:
friend class MemorySSA;
- unsigned getID() const override {
- llvm_unreachable("MemoryUses do not have IDs");
- }
-
private:
+ static void deleteMe(DerivedUser *Self);
+
unsigned int OptimizedID;
};
@@ -350,8 +350,8 @@ public:
MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB,
unsigned Ver)
- : MemoryUseOrDef(C, DMA, MemoryDefVal, MI, BB), ID(Ver),
- Optimized(nullptr), OptimizedID(INVALID_MEMORYACCESS_ID) {}
+ : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB),
+ ID(Ver), Optimized(nullptr), OptimizedID(INVALID_MEMORYACCESS_ID) {}
// allocate space for exactly one operand
void *operator new(size_t s) { return User::operator new(s, 1); }
@@ -361,27 +361,28 @@ public:
return MA->getValueID() == MemoryDefVal;
}
- virtual void setOptimized(MemoryAccess *MA) override {
+ void setOptimized(MemoryAccess *MA) {
Optimized = MA;
OptimizedID = getDefiningAccess()->getID();
}
- virtual MemoryAccess *getOptimized() const override { return Optimized; }
- virtual bool isOptimized() const override {
+ MemoryAccess *getOptimized() const { return Optimized; }
+ bool isOptimized() const {
return getOptimized() && getDefiningAccess() &&
OptimizedID == getDefiningAccess()->getID();
}
- virtual void resetOptimized() override {
+ void resetOptimized() {
OptimizedID = INVALID_MEMORYACCESS_ID;
}
- void print(raw_ostream &OS) const override;
+ void print(raw_ostream &OS) const;
-protected:
friend class MemorySSA;
- unsigned getID() const override { return ID; }
+ unsigned getID() const { return ID; }
private:
+ static void deleteMe(DerivedUser *Self);
+
const unsigned ID;
MemoryAccess *Optimized;
unsigned int OptimizedID;
@@ -432,7 +433,8 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0)
- : MemoryAccess(C, MemoryPhiVal, BB, 0), ID(Ver), ReservedSpace(NumPreds) {
+ : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, 0), ID(Ver),
+ ReservedSpace(NumPreds) {
allocHungoffUses(ReservedSpace);
}
@@ -534,7 +536,9 @@ public:
return V->getValueID() == MemoryPhiVal;
}
- void print(raw_ostream &OS) const override;
+ void print(raw_ostream &OS) const;
+
+ unsigned getID() const { return ID; }
protected:
friend class MemorySSA;
@@ -546,8 +550,6 @@ protected:
User::allocHungoffUses(N, /* IsPhi */ true);
}
- unsigned getID() const final { return ID; }
-
private:
// For debugging only
const unsigned ID;
@@ -561,8 +563,45 @@ private:
ReservedSpace = std::max(E + E / 2, 2u);
growHungoffUses(ReservedSpace, /* IsPhi */ true);
}
+
+ static void deleteMe(DerivedUser *Self);
};
+inline unsigned MemoryAccess::getID() const {
+ assert((isa<MemoryDef>(this) || isa<MemoryPhi>(this)) &&
+ "only memory defs and phis have ids");
+ if (const auto *MD = dyn_cast<MemoryDef>(this))
+ return MD->getID();
+ return cast<MemoryPhi>(this)->getID();
+}
+
+inline bool MemoryUseOrDef::isOptimized() const {
+ if (const auto *MD = dyn_cast<MemoryDef>(this))
+ return MD->isOptimized();
+ return cast<MemoryUse>(this)->isOptimized();
+}
+
+inline MemoryAccess *MemoryUseOrDef::getOptimized() const {
+ if (const auto *MD = dyn_cast<MemoryDef>(this))
+ return MD->getOptimized();
+ return cast<MemoryUse>(this)->getOptimized();
+}
+
+inline void MemoryUseOrDef::setOptimized(MemoryAccess *MA) {
+ if (auto *MD = dyn_cast<MemoryDef>(this))
+ MD->setOptimized(MA);
+ else
+ cast<MemoryUse>(this)->setOptimized(MA);
+}
+
+inline void MemoryUseOrDef::resetOptimized() {
+ if (auto *MD = dyn_cast<MemoryDef>(this))
+ MD->resetOptimized();
+ else
+ cast<MemoryUse>(this)->resetOptimized();
+}
+
+
template <> struct OperandTraits<MemoryPhi> : public HungoffOperandTraits<2> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
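
The static deleteMe callbacks declared above take over the role of the virtual destructor that MemoryAccess no longer has. A sketch of what each definition plausibly amounts to, assuming it only needs to recover the concrete type (the MemorySSA.cpp side appears in the diffstat but not in this excerpt):

    void MemoryUse::deleteMe(DerivedUser *Self) {
      // Recover the concrete type, then run the normal destructor/operator delete.
      delete static_cast<MemoryUse *>(Self);
    }
    // MemoryDef::deleteMe and MemoryPhi::deleteMe would mirror this for their
    // own types.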
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index ceca6cb389a1..ac54bd4cfffb 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -656,10 +656,12 @@ private:
/// Test whether this BackedgeTakenInfo contains complete information.
bool hasFullInfo() const { return isComplete(); }
- /// Return an expression indicating the exact backedge-taken count of the
- /// loop if it is known or SCEVCouldNotCompute otherwise. This is the
- /// number of times the loop header can be guaranteed to execute, minus
- /// one.
+ /// Return an expression indicating the exact *backedge-taken*
+ /// count of the loop if it is known or SCEVCouldNotCompute
+ /// otherwise. If execution makes it to the backedge on every
+ /// iteration (i.e. there are no abnormal exits like exception
+ /// throws and thread exits) then this is the number of times the
+ /// loop header will execute minus one.
///
/// If the SCEV predicate associated with the answer can be different
/// from AlwaysTrue, we must add a (non null) Predicates argument.
@@ -1398,11 +1400,11 @@ public:
const SCEV *getExitCount(const Loop *L, BasicBlock *ExitingBlock);
/// If the specified loop has a predictable backedge-taken count, return it,
- /// otherwise return a SCEVCouldNotCompute object. The backedge-taken count
- /// is the number of times the loop header will be branched to from within
- /// the loop. This is one less than the trip count of the loop, since it
- /// doesn't count the first iteration, when the header is branched to from
- /// outside the loop.
+ /// otherwise return a SCEVCouldNotCompute object. The backedge-taken count is
+ /// the number of times the loop header will be branched to from within the
+ /// loop, assuming there are no abnormal exits like exception throws. This is
+ /// one less than the trip count of the loop, since it doesn't count the first
+ /// iteration, when the header is branched to from outside the loop.
///
/// Note that it is not valid to call this method on a loop without a
/// loop-invariant backedge-taken count (see
@@ -1417,8 +1419,10 @@ public:
const SCEV *getPredicatedBackedgeTakenCount(const Loop *L,
SCEVUnionPredicate &Predicates);
- /// Similar to getBackedgeTakenCount, except return the least SCEV value
- /// that is known never to be less than the actual backedge taken count.
+ /// When successful, this returns a SCEVConstant that is greater than or equal
+ /// to (i.e. a "conservative over-approximation" of) the value returned by
+ /// getBackedgeTakenCount. If such a value cannot be computed, it returns the
+ /// SCEVCouldNotCompute object.
const SCEV *getMaxBackedgeTakenCount(const Loop *L);
/// Return true if the backedge taken count is either the value returned by
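
To make the counting convention in the comments above concrete: in the sketch below the loop header executes three times, so the backedge-taken count is 2 and the trip count is 3; getMaxBackedgeTakenCount() would return a constant upper bound on that same 2.

    // Not tied to any particular file; purely illustrative.
    void runThree(int *A) {
      for (int i = 0; i < 3; ++i)
        A[i] = i;
    }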
diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h
index 944250cfd6ac..0e3bdaa11c9a 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/include/llvm/Analysis/TargetLibraryInfo.h
@@ -191,6 +191,14 @@ public:
void setShouldSignExtI32Param(bool Val) {
ShouldSignExtI32Param = Val;
}
+
+ /// Returns the size of the wchar_t type in bytes.
+ unsigned getWCharSize(const Module &M) const;
+
+ /// Returns the size of the default wchar_t type on target \p T. This is mostly
+ /// intended to verify that the size in the frontend matches LLVM. All other
+ /// queries should use getWCharSize() instead.
+ static unsigned getTargetWCharSize(const Triple &T);
};
/// Provides information about what library functions are available for
@@ -307,6 +315,11 @@ public:
return Attribute::None;
}
+ /// \copydoc TargetLibraryInfoImpl::getWCharSize()
+ unsigned getWCharSize(const Module &M) const {
+ return Impl->getWCharSize(M);
+ }
+
/// Handle invalidation from the pass manager.
///
/// If we try to invalidate this info, just return false. It cannot become
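
A hedged caller sketch for the new query (the helper below is hypothetical): getWCharSize() reports bytes, so scale to bits before comparing against an IR integer element type.

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    static bool isWCharElement(const TargetLibraryInfo &TLI, const Module &M,
                               Type *EltTy) {
      return EltTy->isIntegerTy(8 * TLI.getWCharSize(M));
    }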
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index f5f323c6b797..cf24062e46f8 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -218,9 +218,38 @@ template <typename T> class ArrayRef;
DL);
}
- /// Returns true if the GEP is based on a pointer to a string (array of i8),
- /// and is indexing into this string.
- bool isGEPBasedOnPointerToString(const GEPOperator *GEP);
+ /// Returns true if the GEP is based on a pointer to a string (array of
+ /// \p CharSize integers) and is indexing into this string.
+ bool isGEPBasedOnPointerToString(const GEPOperator *GEP,
+ unsigned CharSize = 8);
+
+ /// Represents offset+length into a ConstantDataArray.
+ struct ConstantDataArraySlice {
+ /// ConstantDataArray pointer. nullptr indicates a zeroinitializer (a valid
+ /// initializer, it just doesn't fit the ConstantDataArray interface).
+ const ConstantDataArray *Array;
+ /// Slice starts at this Offset.
+ uint64_t Offset;
+ /// Length of the slice.
+ uint64_t Length;
+
+ /// Moves the Offset and adjusts Length accordingly.
+ void move(uint64_t Delta) {
+ assert(Delta < Length);
+ Offset += Delta;
+ Length -= Delta;
+ }
+ /// Convenience accessor for elements in the slice.
+ uint64_t operator[](unsigned I) const {
+ return Array==nullptr ? 0 : Array->getElementAsInteger(I + Offset);
+ }
+ };
+
+ /// Returns true if the value \p V is a pointer into a ConstantDataArray.
+ /// If successful, \p Slice will point to a ConstantDataArray info object
+ /// with an appropriate offset.
+ bool getConstantDataArrayInfo(const Value *V, ConstantDataArraySlice &Slice,
+ unsigned ElementSize, uint64_t Offset = 0);
/// This function computes the length of a null-terminated C string pointed to
/// by V. If successful, it returns true and returns the string in Str. If
@@ -233,7 +262,7 @@ template <typename T> class ArrayRef;
/// If we can compute the length of the string pointed to by the specified
/// pointer, return 'len+1'. If we can't, return 0.
- uint64_t GetStringLength(const Value *V);
+ uint64_t GetStringLength(const Value *V, unsigned CharSize = 8);
/// This method strips off any GEP address adjustments and pointer casts from
/// the specified value, returning the original object being addressed. Note
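
A sketch of consuming the slice (the helper is hypothetical, and assumes ElementSize follows the same bit-width convention as CharSize above): scan for the terminating zero of a wide string.

    #include "llvm/Analysis/ValueTracking.h"
    using namespace llvm;

    static uint64_t wideStrLen(const Value *V, unsigned ElementSizeInBits) {
      ConstantDataArraySlice Slice;
      if (!getConstantDataArrayInfo(V, Slice, ElementSizeInBits))
        return 0;
      uint64_t Len = 0;
      while (Len < Slice.Length && Slice[Len] != 0) // operator[] yields 0 for a zeroinitializer
        ++Len;
      return Len;
    }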
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index f5b1f87720ad..181cb375de86 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -189,7 +189,7 @@ extern cl::opt<bool> UseSegmentSetForPhysRegs;
void pruneValue(LiveRange &LR, SlotIndex Kill,
SmallVectorImpl<SlotIndex> *EndPoints);
- /// This function should be used. Its intend is to tell you that
+ /// This function should not be used. Its intent is to tell you that
/// you are doing something wrong if you call pruneValue directly on a
/// LiveInterval. Indeed, you are supposed to call pruneValue on the main
/// LiveRange and all the LiveRange of the subranges if any.
diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h
index a90fe96227b9..e92bb7f74967 100644
--- a/include/llvm/CodeGen/MachineValueType.h
+++ b/include/llvm/CodeGen/MachineValueType.h
@@ -56,117 +56,119 @@ class MVT {
FIRST_FP_VALUETYPE = f16,
LAST_FP_VALUETYPE = ppcf128,
- v2i1 = 14, // 2 x i1
- v4i1 = 15, // 4 x i1
- v8i1 = 16, // 8 x i1
- v16i1 = 17, // 16 x i1
- v32i1 = 18, // 32 x i1
- v64i1 = 19, // 64 x i1
- v512i1 = 20, // 512 x i1
- v1024i1 = 21, // 1024 x i1
-
- v1i8 = 22, // 1 x i8
- v2i8 = 23, // 2 x i8
- v4i8 = 24, // 4 x i8
- v8i8 = 25, // 8 x i8
- v16i8 = 26, // 16 x i8
- v32i8 = 27, // 32 x i8
- v64i8 = 28, // 64 x i8
- v128i8 = 29, //128 x i8
- v256i8 = 30, //256 x i8
-
- v1i16 = 31, // 1 x i16
- v2i16 = 32, // 2 x i16
- v4i16 = 33, // 4 x i16
- v8i16 = 34, // 8 x i16
- v16i16 = 35, // 16 x i16
- v32i16 = 36, // 32 x i16
- v64i16 = 37, // 64 x i16
- v128i16 = 38, //128 x i16
-
- v1i32 = 39, // 1 x i32
- v2i32 = 40, // 2 x i32
- v4i32 = 41, // 4 x i32
- v8i32 = 42, // 8 x i32
- v16i32 = 43, // 16 x i32
- v32i32 = 44, // 32 x i32
- v64i32 = 45, // 64 x i32
-
- v1i64 = 46, // 1 x i64
- v2i64 = 47, // 2 x i64
- v4i64 = 48, // 4 x i64
- v8i64 = 49, // 8 x i64
- v16i64 = 50, // 16 x i64
- v32i64 = 51, // 32 x i64
-
- v1i128 = 52, // 1 x i128
+ v1i1 = 14, // 1 x i1
+ v2i1 = 15, // 2 x i1
+ v4i1 = 16, // 4 x i1
+ v8i1 = 17, // 8 x i1
+ v16i1 = 18, // 16 x i1
+ v32i1 = 19, // 32 x i1
+ v64i1 = 20, // 64 x i1
+ v512i1 = 21, // 512 x i1
+ v1024i1 = 22, // 1024 x i1
+
+ v1i8 = 23, // 1 x i8
+ v2i8 = 24, // 2 x i8
+ v4i8 = 25, // 4 x i8
+ v8i8 = 26, // 8 x i8
+ v16i8 = 27, // 16 x i8
+ v32i8 = 28, // 32 x i8
+ v64i8 = 29, // 64 x i8
+ v128i8 = 30, //128 x i8
+ v256i8 = 31, //256 x i8
+
+ v1i16 = 32, // 1 x i16
+ v2i16 = 33, // 2 x i16
+ v4i16 = 34, // 4 x i16
+ v8i16 = 35, // 8 x i16
+ v16i16 = 36, // 16 x i16
+ v32i16 = 37, // 32 x i16
+ v64i16 = 38, // 64 x i16
+ v128i16 = 39, //128 x i16
+
+ v1i32 = 40, // 1 x i32
+ v2i32 = 41, // 2 x i32
+ v4i32 = 42, // 4 x i32
+ v8i32 = 43, // 8 x i32
+ v16i32 = 44, // 16 x i32
+ v32i32 = 45, // 32 x i32
+ v64i32 = 46, // 64 x i32
+
+ v1i64 = 47, // 1 x i64
+ v2i64 = 48, // 2 x i64
+ v4i64 = 49, // 4 x i64
+ v8i64 = 50, // 8 x i64
+ v16i64 = 51, // 16 x i64
+ v32i64 = 52, // 32 x i64
+
+ v1i128 = 53, // 1 x i128
// Scalable integer types
- nxv2i1 = 53, // n x 2 x i1
- nxv4i1 = 54, // n x 4 x i1
- nxv8i1 = 55, // n x 8 x i1
- nxv16i1 = 56, // n x 16 x i1
- nxv32i1 = 57, // n x 32 x i1
-
- nxv1i8 = 58, // n x 1 x i8
- nxv2i8 = 59, // n x 2 x i8
- nxv4i8 = 60, // n x 4 x i8
- nxv8i8 = 61, // n x 8 x i8
- nxv16i8 = 62, // n x 16 x i8
- nxv32i8 = 63, // n x 32 x i8
-
- nxv1i16 = 64, // n x 1 x i16
- nxv2i16 = 65, // n x 2 x i16
- nxv4i16 = 66, // n x 4 x i16
- nxv8i16 = 67, // n x 8 x i16
- nxv16i16 = 68, // n x 16 x i16
- nxv32i16 = 69, // n x 32 x i16
-
- nxv1i32 = 70, // n x 1 x i32
- nxv2i32 = 71, // n x 2 x i32
- nxv4i32 = 72, // n x 4 x i32
- nxv8i32 = 73, // n x 8 x i32
- nxv16i32 = 74, // n x 16 x i32
- nxv32i32 = 75, // n x 32 x i32
-
- nxv1i64 = 76, // n x 1 x i64
- nxv2i64 = 77, // n x 2 x i64
- nxv4i64 = 78, // n x 4 x i64
- nxv8i64 = 79, // n x 8 x i64
- nxv16i64 = 80, // n x 16 x i64
- nxv32i64 = 81, // n x 32 x i64
-
- FIRST_INTEGER_VECTOR_VALUETYPE = v2i1,
+ nxv1i1 = 54, // n x 1 x i1
+ nxv2i1 = 55, // n x 2 x i1
+ nxv4i1 = 56, // n x 4 x i1
+ nxv8i1 = 57, // n x 8 x i1
+ nxv16i1 = 58, // n x 16 x i1
+ nxv32i1 = 59, // n x 32 x i1
+
+ nxv1i8 = 60, // n x 1 x i8
+ nxv2i8 = 61, // n x 2 x i8
+ nxv4i8 = 62, // n x 4 x i8
+ nxv8i8 = 63, // n x 8 x i8
+ nxv16i8 = 64, // n x 16 x i8
+ nxv32i8 = 65, // n x 32 x i8
+
+ nxv1i16 = 66, // n x 1 x i16
+ nxv2i16 = 67, // n x 2 x i16
+ nxv4i16 = 68, // n x 4 x i16
+ nxv8i16 = 69, // n x 8 x i16
+ nxv16i16 = 70, // n x 16 x i16
+ nxv32i16 = 71, // n x 32 x i16
+
+ nxv1i32 = 72, // n x 1 x i32
+ nxv2i32 = 73, // n x 2 x i32
+ nxv4i32 = 74, // n x 4 x i32
+ nxv8i32 = 75, // n x 8 x i32
+ nxv16i32 = 76, // n x 16 x i32
+ nxv32i32 = 77, // n x 32 x i32
+
+ nxv1i64 = 78, // n x 1 x i64
+ nxv2i64 = 79, // n x 2 x i64
+ nxv4i64 = 80, // n x 4 x i64
+ nxv8i64 = 81, // n x 8 x i64
+ nxv16i64 = 82, // n x 16 x i64
+ nxv32i64 = 83, // n x 32 x i64
+
+ FIRST_INTEGER_VECTOR_VALUETYPE = v1i1,
LAST_INTEGER_VECTOR_VALUETYPE = nxv32i64,
- FIRST_INTEGER_SCALABLE_VALUETYPE = nxv2i1,
+ FIRST_INTEGER_SCALABLE_VALUETYPE = nxv1i1,
LAST_INTEGER_SCALABLE_VALUETYPE = nxv32i64,
- v2f16 = 82, // 2 x f16
- v4f16 = 83, // 4 x f16
- v8f16 = 84, // 8 x f16
- v1f32 = 85, // 1 x f32
- v2f32 = 86, // 2 x f32
- v4f32 = 87, // 4 x f32
- v8f32 = 88, // 8 x f32
- v16f32 = 89, // 16 x f32
- v1f64 = 90, // 1 x f64
- v2f64 = 91, // 2 x f64
- v4f64 = 92, // 4 x f64
- v8f64 = 93, // 8 x f64
-
- nxv2f16 = 94, // n x 2 x f16
- nxv4f16 = 95, // n x 4 x f16
- nxv8f16 = 96, // n x 8 x f16
- nxv1f32 = 97, // n x 1 x f32
- nxv2f32 = 98, // n x 2 x f32
- nxv4f32 = 99, // n x 4 x f32
- nxv8f32 = 100, // n x 8 x f32
- nxv16f32 = 101, // n x 16 x f32
- nxv1f64 = 102, // n x 1 x f64
- nxv2f64 = 103, // n x 2 x f64
- nxv4f64 = 104, // n x 4 x f64
- nxv8f64 = 105, // n x 8 x f64
+ v2f16 = 84, // 2 x f16
+ v4f16 = 85, // 4 x f16
+ v8f16 = 86, // 8 x f16
+ v1f32 = 87, // 1 x f32
+ v2f32 = 88, // 2 x f32
+ v4f32 = 89, // 4 x f32
+ v8f32 = 90, // 8 x f32
+ v16f32 = 91, // 16 x f32
+ v1f64 = 92, // 1 x f64
+ v2f64 = 93, // 2 x f64
+ v4f64 = 94, // 4 x f64
+ v8f64 = 95, // 8 x f64
+
+ nxv2f16 = 96, // n x 2 x f16
+ nxv4f16 = 97, // n x 4 x f16
+ nxv8f16 = 98, // n x 8 x f16
+ nxv1f32 = 99, // n x 1 x f32
+ nxv2f32 = 100, // n x 2 x f32
+ nxv4f32 = 101, // n x 4 x f32
+ nxv8f32 = 102, // n x 8 x f32
+ nxv16f32 = 103, // n x 16 x f32
+ nxv1f64 = 104, // n x 1 x f64
+ nxv2f64 = 105, // n x 2 x f64
+ nxv4f64 = 106, // n x 4 x f64
+ nxv8f64 = 107, // n x 8 x f64
FIRST_FP_VECTOR_VALUETYPE = v2f16,
LAST_FP_VECTOR_VALUETYPE = nxv8f64,
@@ -174,21 +176,21 @@ class MVT {
FIRST_FP_SCALABLE_VALUETYPE = nxv2f16,
LAST_FP_SCALABLE_VALUETYPE = nxv8f64,
- FIRST_VECTOR_VALUETYPE = v2i1,
+ FIRST_VECTOR_VALUETYPE = v1i1,
LAST_VECTOR_VALUETYPE = nxv8f64,
- x86mmx = 106, // This is an X86 MMX value
+ x86mmx = 108, // This is an X86 MMX value
- Glue = 107, // This glues nodes together during pre-RA sched
+ Glue = 109, // This glues nodes together during pre-RA sched
- isVoid = 108, // This has no value
+ isVoid = 110, // This has no value
- Untyped = 109, // This value takes a register, but has
+ Untyped = 111, // This value takes a register, but has
// unspecified type. The register class
// will be determined by the opcode.
FIRST_VALUETYPE = 1, // This is always the beginning of the list.
- LAST_VALUETYPE = 110, // This always remains at the end of the list.
+ LAST_VALUETYPE = 112, // This always remains at the end of the list.
// This is the current maximum for LAST_VALUETYPE.
// MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -411,6 +413,7 @@ class MVT {
switch (SimpleTy) {
default:
llvm_unreachable("Not a vector MVT!");
+ case v1i1:
case v2i1:
case v4i1:
case v8i1:
@@ -419,6 +422,7 @@ class MVT {
case v64i1:
case v512i1:
case v1024i1:
+ case nxv1i1:
case nxv2i1:
case nxv4i1:
case nxv8i1:
@@ -589,6 +593,7 @@ class MVT {
case nxv2f16:
case nxv2f32:
case nxv2f64: return 2;
+ case v1i1:
case v1i8:
case v1i16:
case v1i32:
@@ -596,6 +601,7 @@ class MVT {
case v1i128:
case v1f32:
case v1f64:
+ case nxv1i1:
case nxv1i8:
case nxv1i16:
case nxv1i32:
@@ -628,7 +634,9 @@ class MVT {
"in codegen and has no size");
case Metadata:
llvm_unreachable("Value type is metadata.");
- case i1 : return 1;
+ case i1:
+ case v1i1:
+ case nxv1i1: return 1;
case v2i1:
case nxv2i1: return 2;
case v4i1:
@@ -814,6 +822,7 @@ class MVT {
default:
break;
case MVT::i1:
+ if (NumElements == 1) return MVT::v1i1;
if (NumElements == 2) return MVT::v2i1;
if (NumElements == 4) return MVT::v4i1;
if (NumElements == 8) return MVT::v8i1;
@@ -891,6 +900,7 @@ class MVT {
default:
break;
case MVT::i1:
+ if (NumElements == 1) return MVT::nxv1i1;
if (NumElements == 2) return MVT::nxv2i1;
if (NumElements == 4) return MVT::nxv4i1;
if (NumElements == 8) return MVT::nxv8i1;
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 8a5a1997386f..f3e04cffcda6 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -33,7 +33,7 @@ class raw_ostream;
/// List of target independent CodeGen pass IDs.
namespace llvm {
- FunctionPass *createAtomicExpandPass(const TargetMachine *TM);
+ FunctionPass *createAtomicExpandPass();
/// createUnreachableBlockEliminationPass - The LLVM code generator does not
/// work well with unreachable basic blocks (what live ranges make sense for a
@@ -66,7 +66,7 @@ namespace llvm {
/// createCodeGenPreparePass - Transform the code to expose more pattern
/// matching during instruction selection.
- FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
+ FunctionPass *createCodeGenPreparePass();
/// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather
/// and scatter intrinsics with scalar code when target doesn't support them.
@@ -133,10 +133,6 @@ namespace llvm {
// instruction and update the MachineFunctionInfo with that information.
extern char &ShrinkWrapID;
- /// LiveRangeShrink pass. Move instruction close to its definition to shrink
- /// the definition's live range.
- extern char &LiveRangeShrinkID;
-
/// Greedy register allocator.
extern char &RAGreedyID;
@@ -177,7 +173,7 @@ namespace llvm {
/// PrologEpilogCodeInserter - This pass inserts prolog and epilog code,
/// and eliminates abstract frame references.
extern char &PrologEpilogCodeInserterID;
- MachineFunctionPass *createPrologEpilogInserterPass(const TargetMachine *TM);
+ MachineFunctionPass *createPrologEpilogInserterPass();
/// ExpandPostRAPseudos - This pass expands pseudo instructions after
/// register allocation.
@@ -305,7 +301,7 @@ namespace llvm {
/// createStackProtectorPass - This pass adds stack protectors to functions.
///
- FunctionPass *createStackProtectorPass(const TargetMachine *TM);
+ FunctionPass *createStackProtectorPass();
/// createMachineVerifierPass - This pass verifies generated machine code
/// instructions for correctness.
@@ -314,11 +310,11 @@ namespace llvm {
/// createDwarfEHPass - This pass mulches exception handling code into a form
/// adapted to code generation. Required if using dwarf exception handling.
- FunctionPass *createDwarfEHPass(const TargetMachine *TM);
+ FunctionPass *createDwarfEHPass();
/// createWinEHPass - Prepares personality functions used by MSVC on Windows,
/// in addition to the Itanium LSDA based personalities.
- FunctionPass *createWinEHPass(const TargetMachine *TM);
+ FunctionPass *createWinEHPass();
/// createSjLjEHPreparePass - This pass adapts exception handling code to use
/// the GCC-style builtin setjmp/longjmp (sjlj) to handling EH control flow.
@@ -362,12 +358,12 @@ namespace llvm {
/// InterleavedAccess Pass - This pass identifies and matches interleaved
/// memory accesses to target specific intrinsics.
///
- FunctionPass *createInterleavedAccessPass(const TargetMachine *TM);
+ FunctionPass *createInterleavedAccessPass();
/// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all
/// TLS variables for the emulated TLS model.
///
- ModulePass *createLowerEmuTLSPass(const TargetMachine *TM);
+ ModulePass *createLowerEmuTLSPass();
/// This pass lowers the @llvm.load.relative intrinsic to instructions.
/// This is unsafe to do earlier because a pass may combine the constant
@@ -384,7 +380,7 @@ namespace llvm {
/// This pass splits the stack into a safe stack and an unsafe stack to
/// protect against stack-based overflow vulnerabilities.
- FunctionPass *createSafeStackPass(const TargetMachine *TM = nullptr);
+ FunctionPass *createSafeStackPass();
/// This pass detects subregister lanes in a virtual register that are used
/// independently of other lanes and splits them into separate virtual
@@ -419,33 +415,4 @@ namespace llvm {
} // End llvm namespace
-/// Target machine pass initializer for passes with dependencies. Use with
-/// INITIALIZE_TM_PASS_END.
-#define INITIALIZE_TM_PASS_BEGIN INITIALIZE_PASS_BEGIN
-
-/// Target machine pass initializer for passes with dependencies. Use with
-/// INITIALIZE_TM_PASS_BEGIN.
-#define INITIALIZE_TM_PASS_END(passName, arg, name, cfg, analysis) \
- PassInfo *PI = new PassInfo( \
- name, arg, &passName::ID, \
- PassInfo::NormalCtor_t(callDefaultCtor<passName>), cfg, analysis, \
- PassInfo::TargetMachineCtor_t(callTargetMachineCtor<passName>)); \
- Registry.registerPass(*PI, true); \
- return PI; \
- } \
- static llvm::once_flag Initialize##passName##PassFlag; \
- void llvm::initialize##passName##Pass(PassRegistry &Registry) { \
- llvm::call_once(Initialize##passName##PassFlag, \
- initialize##passName##PassOnce, std::ref(Registry)); \
- }
-
-/// This initializer registers TargetMachine constructor, so the pass being
-/// initialized can use target dependent interfaces. Please do not move this
-/// macro to be together with INITIALIZE_PASS, which is a complete target
-/// independent initializer, and we don't want to make libScalarOpts depend
-/// on libCodeGen.
-#define INITIALIZE_TM_PASS(passName, arg, name, cfg, analysis) \
- INITIALIZE_TM_PASS_BEGIN(passName, arg, name, cfg, analysis) \
- INITIALIZE_TM_PASS_END(passName, arg, name, cfg, analysis)
-
#endif
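The net effect of the Passes.h changes is that these factory functions no longer thread a TargetMachine through the constructor. A hedged sketch of how a tool built against this revision might populate a legacy pass manager, assuming the surrounding setup still owns a configured TargetMachine and TargetPassConfig:

    #include "llvm/CodeGen/Passes.h"
    #include "llvm/IR/LegacyPassManager.h"
    using namespace llvm;

    void addCodeGenIRPasses(legacy::PassManagerBase &PM) {
      // None of these take a TargetMachine any more; the passes recover it
      // from TargetPassConfig at run time instead.
      PM.add(createAtomicExpandPass());
      PM.add(createCodeGenPreparePass());
      PM.add(createStackProtectorPass());
    }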
diff --git a/include/llvm/CodeGen/StackProtector.h b/include/llvm/CodeGen/StackProtector.h
index 0655f19a323e..b970de71f862 100644
--- a/include/llvm/CodeGen/StackProtector.h
+++ b/include/llvm/CodeGen/StackProtector.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Pass.h"
@@ -55,7 +56,7 @@ private:
/// TLI - Keep a pointer to a TargetLowering to consult for determining
/// target type sizes.
const TargetLoweringBase *TLI = nullptr;
- const Triple Trip;
+ Triple Trip;
Function *F;
Module *M;
@@ -114,17 +115,12 @@ private:
public:
static char ID; // Pass identification, replacement for typeid.
- StackProtector() : FunctionPass(ID) {
- initializeStackProtectorPass(*PassRegistry::getPassRegistry());
- }
-
- StackProtector(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), Trip(TM->getTargetTriple()),
- SSPBufferSize(8) {
+ StackProtector() : FunctionPass(ID), SSPBufferSize(8) {
initializeStackProtectorPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
AU.addPreserved<DominatorTreeWrapperPass>();
}
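StackProtector now declares a dependency on TargetPassConfig instead of being handed a TargetMachine. A rough sketch of the pattern for a pass written this way; the pass itself is hypothetical and the getTM<>() accessor on TargetPassConfig is assumed, not shown in this diff:

    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/Pass.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    namespace {
    // Hypothetical pass following the same pattern as StackProtector.
    struct ExamplePass : FunctionPass {
      static char ID;
      ExamplePass() : FunctionPass(ID) {}

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addRequired<TargetPassConfig>();
      }

      bool runOnFunction(Function &F) override {
        // Recover the TargetMachine through the required analysis instead of
        // receiving it in the constructor.
        const TargetMachine &TM =
            getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
        (void)TM;
        return false;
      }
    };
    } // end anonymous namespace
    char ExamplePass::ID = 0;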
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index b87a5e56699e..b1e62daa5aae 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -33,115 +33,117 @@ def f80 : ValueType<80 , 11>; // 80-bit floating point value
def f128 : ValueType<128, 12>; // 128-bit floating point value
def ppcf128: ValueType<128, 13>; // PPC 128-bit floating point value
-def v2i1 : ValueType<2 , 14>; // 2 x i1 vector value
-def v4i1 : ValueType<4 , 15>; // 4 x i1 vector value
-def v8i1 : ValueType<8 , 16>; // 8 x i1 vector value
-def v16i1 : ValueType<16, 17>; // 16 x i1 vector value
-def v32i1 : ValueType<32 , 18>; // 32 x i1 vector value
-def v64i1 : ValueType<64 , 19>; // 64 x i1 vector value
-def v512i1 : ValueType<512, 20>; // 512 x i1 vector value
-def v1024i1: ValueType<1024,21>; //1024 x i1 vector value
-
-def v1i8 : ValueType<8, 22>; // 1 x i8 vector value
-def v2i8 : ValueType<16 , 23>; // 2 x i8 vector value
-def v4i8 : ValueType<32 , 24>; // 4 x i8 vector value
-def v8i8 : ValueType<64 , 25>; // 8 x i8 vector value
-def v16i8 : ValueType<128, 26>; // 16 x i8 vector value
-def v32i8 : ValueType<256, 27>; // 32 x i8 vector value
-def v64i8 : ValueType<512, 28>; // 64 x i8 vector value
-def v128i8 : ValueType<1024,29>; //128 x i8 vector value
-def v256i8 : ValueType<2048,30>; //256 x i8 vector value
-
-def v1i16 : ValueType<16 , 31>; // 1 x i16 vector value
-def v2i16 : ValueType<32 , 32>; // 2 x i16 vector value
-def v4i16 : ValueType<64 , 33>; // 4 x i16 vector value
-def v8i16 : ValueType<128, 34>; // 8 x i16 vector value
-def v16i16 : ValueType<256, 35>; // 16 x i16 vector value
-def v32i16 : ValueType<512, 36>; // 32 x i16 vector value
-def v64i16 : ValueType<1024,37>; // 64 x i16 vector value
-def v128i16: ValueType<2048,38>; //128 x i16 vector value
-
-def v1i32 : ValueType<32 , 39>; // 1 x i32 vector value
-def v2i32 : ValueType<64 , 40>; // 2 x i32 vector value
-def v4i32 : ValueType<128, 41>; // 4 x i32 vector value
-def v8i32 : ValueType<256, 42>; // 8 x i32 vector value
-def v16i32 : ValueType<512, 43>; // 16 x i32 vector value
-def v32i32 : ValueType<1024,44>; // 32 x i32 vector value
-def v64i32 : ValueType<2048,45>; // 32 x i32 vector value
-
-def v1i64 : ValueType<64 , 46>; // 1 x i64 vector value
-def v2i64 : ValueType<128, 47>; // 2 x i64 vector value
-def v4i64 : ValueType<256, 48>; // 4 x i64 vector value
-def v8i64 : ValueType<512, 49>; // 8 x i64 vector value
-def v16i64 : ValueType<1024,50>; // 16 x i64 vector value
-def v32i64 : ValueType<2048,51>; // 32 x i64 vector value
-
-def v1i128 : ValueType<128, 52>; // 1 x i128 vector value
-
-def nxv2i1 : ValueType<2, 53>; // n x 2 x i1 vector value
-def nxv4i1 : ValueType<4, 54>; // n x 4 x i1 vector value
-def nxv8i1 : ValueType<8, 55>; // n x 8 x i1 vector value
-def nxv16i1 : ValueType<16, 56>; // n x 16 x i1 vector value
-def nxv32i1 : ValueType<32, 57>; // n x 32 x i1 vector value
-
-def nxv1i8 : ValueType<8, 58>; // n x 1 x i8 vector value
-def nxv2i8 : ValueType<16, 59>; // n x 2 x i8 vector value
-def nxv4i8 : ValueType<32, 60>; // n x 4 x i8 vector value
-def nxv8i8 : ValueType<64, 61>; // n x 8 x i8 vector value
-def nxv16i8 : ValueType<128, 62>; // n x 16 x i8 vector value
-def nxv32i8 : ValueType<256, 63>; // n x 32 x i8 vector value
-
-def nxv1i16 : ValueType<16, 64>; // n x 1 x i16 vector value
-def nxv2i16 : ValueType<32, 65>; // n x 2 x i16 vector value
-def nxv4i16 : ValueType<64, 66>; // n x 4 x i16 vector value
-def nxv8i16 : ValueType<128, 67>; // n x 8 x i16 vector value
-def nxv16i16: ValueType<256, 68>; // n x 16 x i16 vector value
-def nxv32i16: ValueType<512, 69>; // n x 32 x i16 vector value
-
-def nxv1i32 : ValueType<32, 70>; // n x 1 x i32 vector value
-def nxv2i32 : ValueType<64, 71>; // n x 2 x i32 vector value
-def nxv4i32 : ValueType<128, 72>; // n x 4 x i32 vector value
-def nxv8i32 : ValueType<256, 73>; // n x 8 x i32 vector value
-def nxv16i32: ValueType<512, 74>; // n x 16 x i32 vector value
-def nxv32i32: ValueType<1024,75>; // n x 32 x i32 vector value
-
-def nxv1i64 : ValueType<64, 76>; // n x 1 x i64 vector value
-def nxv2i64 : ValueType<128, 77>; // n x 2 x i64 vector value
-def nxv4i64 : ValueType<256, 78>; // n x 4 x i64 vector value
-def nxv8i64 : ValueType<512, 79>; // n x 8 x i64 vector value
-def nxv16i64: ValueType<1024,80>; // n x 16 x i64 vector value
-def nxv32i64: ValueType<2048,81>; // n x 32 x i64 vector value
-
-def v2f16 : ValueType<32 , 82>; // 2 x f16 vector value
-def v4f16 : ValueType<64 , 83>; // 4 x f16 vector value
-def v8f16 : ValueType<128, 84>; // 8 x f16 vector value
-def v1f32 : ValueType<32 , 85>; // 1 x f32 vector value
-def v2f32 : ValueType<64 , 86>; // 2 x f32 vector value
-def v4f32 : ValueType<128, 87>; // 4 x f32 vector value
-def v8f32 : ValueType<256, 88>; // 8 x f32 vector value
-def v16f32 : ValueType<512, 89>; // 16 x f32 vector value
-def v1f64 : ValueType<64, 90>; // 1 x f64 vector value
-def v2f64 : ValueType<128, 91>; // 2 x f64 vector value
-def v4f64 : ValueType<256, 92>; // 4 x f64 vector value
-def v8f64 : ValueType<512, 93>; // 8 x f64 vector value
-
-def nxv2f16 : ValueType<32 , 94>; // n x 2 x f16 vector value
-def nxv4f16 : ValueType<64 , 95>; // n x 4 x f16 vector value
-def nxv8f16 : ValueType<128, 96>; // n x 8 x f16 vector value
-def nxv1f32 : ValueType<32 , 97>; // n x 1 x f32 vector value
-def nxv2f32 : ValueType<64 , 98>; // n x 2 x f32 vector value
-def nxv4f32 : ValueType<128, 99>; // n x 4 x f32 vector value
-def nxv8f32 : ValueType<256, 100>; // n x 8 x f32 vector value
-def nxv16f32 : ValueType<512, 101>; // n x 16 x f32 vector value
-def nxv1f64 : ValueType<64, 102>; // n x 1 x f64 vector value
-def nxv2f64 : ValueType<128, 103>; // n x 2 x f64 vector value
-def nxv4f64 : ValueType<256, 104>; // n x 4 x f64 vector value
-def nxv8f64 : ValueType<512, 105>; // n x 8 x f64 vector value
-
-def x86mmx : ValueType<64 , 106>; // X86 MMX value
-def FlagVT : ValueType<0 , 107>; // Pre-RA sched glue
-def isVoid : ValueType<0 , 108>; // Produces no value
-def untyped: ValueType<8 , 109>; // Produces an untyped value
+def v1i1 : ValueType<1 , 14>; // 1 x i1 vector value
+def v2i1 : ValueType<2 , 15>; // 2 x i1 vector value
+def v4i1 : ValueType<4 , 16>; // 4 x i1 vector value
+def v8i1 : ValueType<8 , 17>; // 8 x i1 vector value
+def v16i1 : ValueType<16, 18>; // 16 x i1 vector value
+def v32i1 : ValueType<32 , 19>; // 32 x i1 vector value
+def v64i1 : ValueType<64 , 20>; // 64 x i1 vector value
+def v512i1 : ValueType<512, 21>; // 512 x i1 vector value
+def v1024i1: ValueType<1024,22>; //1024 x i1 vector value
+
+def v1i8 : ValueType<8, 23>; // 1 x i8 vector value
+def v2i8 : ValueType<16 , 24>; // 2 x i8 vector value
+def v4i8 : ValueType<32 , 25>; // 4 x i8 vector value
+def v8i8 : ValueType<64 , 26>; // 8 x i8 vector value
+def v16i8 : ValueType<128, 27>; // 16 x i8 vector value
+def v32i8 : ValueType<256, 28>; // 32 x i8 vector value
+def v64i8 : ValueType<512, 29>; // 64 x i8 vector value
+def v128i8 : ValueType<1024,30>; //128 x i8 vector value
+def v256i8 : ValueType<2048,31>; //256 x i8 vector value
+
+def v1i16 : ValueType<16 , 32>; // 1 x i16 vector value
+def v2i16 : ValueType<32 , 33>; // 2 x i16 vector value
+def v4i16 : ValueType<64 , 34>; // 4 x i16 vector value
+def v8i16 : ValueType<128, 35>; // 8 x i16 vector value
+def v16i16 : ValueType<256, 36>; // 16 x i16 vector value
+def v32i16 : ValueType<512, 37>; // 32 x i16 vector value
+def v64i16 : ValueType<1024,38>; // 64 x i16 vector value
+def v128i16: ValueType<2048,39>; //128 x i16 vector value
+
+def v1i32 : ValueType<32 , 40>; // 1 x i32 vector value
+def v2i32 : ValueType<64 , 41>; // 2 x i32 vector value
+def v4i32 : ValueType<128, 42>; // 4 x i32 vector value
+def v8i32 : ValueType<256, 43>; // 8 x i32 vector value
+def v16i32 : ValueType<512, 44>; // 16 x i32 vector value
+def v32i32 : ValueType<1024,45>; // 32 x i32 vector value
+def v64i32 : ValueType<2048,46>; // 64 x i32 vector value
+
+def v1i64 : ValueType<64 , 47>; // 1 x i64 vector value
+def v2i64 : ValueType<128, 48>; // 2 x i64 vector value
+def v4i64 : ValueType<256, 49>; // 4 x i64 vector value
+def v8i64 : ValueType<512, 50>; // 8 x i64 vector value
+def v16i64 : ValueType<1024,51>; // 16 x i64 vector value
+def v32i64 : ValueType<2048,52>; // 32 x i64 vector value
+
+def v1i128 : ValueType<128, 53>; // 1 x i128 vector value
+
+def nxv1i1 : ValueType<1, 54>; // n x 1 x i1 vector value
+def nxv2i1 : ValueType<2, 55>; // n x 2 x i1 vector value
+def nxv4i1 : ValueType<4, 56>; // n x 4 x i1 vector value
+def nxv8i1 : ValueType<8, 57>; // n x 8 x i1 vector value
+def nxv16i1 : ValueType<16, 58>; // n x 16 x i1 vector value
+def nxv32i1 : ValueType<32, 59>; // n x 32 x i1 vector value
+
+def nxv1i8 : ValueType<8, 60>; // n x 1 x i8 vector value
+def nxv2i8 : ValueType<16, 61>; // n x 2 x i8 vector value
+def nxv4i8 : ValueType<32, 62>; // n x 4 x i8 vector value
+def nxv8i8 : ValueType<64, 63>; // n x 8 x i8 vector value
+def nxv16i8 : ValueType<128, 64>; // n x 16 x i8 vector value
+def nxv32i8 : ValueType<256, 65>; // n x 32 x i8 vector value
+
+def nxv1i16 : ValueType<16, 66>; // n x 1 x i16 vector value
+def nxv2i16 : ValueType<32, 67>; // n x 2 x i16 vector value
+def nxv4i16 : ValueType<64, 68>; // n x 4 x i16 vector value
+def nxv8i16 : ValueType<128, 69>; // n x 8 x i16 vector value
+def nxv16i16: ValueType<256, 70>; // n x 16 x i16 vector value
+def nxv32i16: ValueType<512, 71>; // n x 32 x i16 vector value
+
+def nxv1i32 : ValueType<32, 72>; // n x 1 x i32 vector value
+def nxv2i32 : ValueType<64, 73>; // n x 2 x i32 vector value
+def nxv4i32 : ValueType<128, 74>; // n x 4 x i32 vector value
+def nxv8i32 : ValueType<256, 75>; // n x 8 x i32 vector value
+def nxv16i32: ValueType<512, 76>; // n x 16 x i32 vector value
+def nxv32i32: ValueType<1024,77>; // n x 32 x i32 vector value
+
+def nxv1i64 : ValueType<64, 78>; // n x 1 x i64 vector value
+def nxv2i64 : ValueType<128, 79>; // n x 2 x i64 vector value
+def nxv4i64 : ValueType<256, 80>; // n x 4 x i64 vector value
+def nxv8i64 : ValueType<512, 81>; // n x 8 x i64 vector value
+def nxv16i64: ValueType<1024,82>; // n x 16 x i64 vector value
+def nxv32i64: ValueType<2048,83>; // n x 32 x i64 vector value
+
+def v2f16 : ValueType<32 , 84>; // 2 x f16 vector value
+def v4f16 : ValueType<64 , 85>; // 4 x f16 vector value
+def v8f16 : ValueType<128, 86>; // 8 x f16 vector value
+def v1f32 : ValueType<32 , 87>; // 1 x f32 vector value
+def v2f32 : ValueType<64 , 88>; // 2 x f32 vector value
+def v4f32 : ValueType<128, 89>; // 4 x f32 vector value
+def v8f32 : ValueType<256, 90>; // 8 x f32 vector value
+def v16f32 : ValueType<512, 91>; // 16 x f32 vector value
+def v1f64 : ValueType<64, 92>; // 1 x f64 vector value
+def v2f64 : ValueType<128, 93>; // 2 x f64 vector value
+def v4f64 : ValueType<256, 94>; // 4 x f64 vector value
+def v8f64 : ValueType<512, 95>; // 8 x f64 vector value
+
+def nxv2f16 : ValueType<32 , 96>; // n x 2 x f16 vector value
+def nxv4f16 : ValueType<64 , 97>; // n x 4 x f16 vector value
+def nxv8f16 : ValueType<128, 98>; // n x 8 x f16 vector value
+def nxv1f32 : ValueType<32 , 99>; // n x 1 x f32 vector value
+def nxv2f32 : ValueType<64 , 100>; // n x 2 x f32 vector value
+def nxv4f32 : ValueType<128, 101>; // n x 4 x f32 vector value
+def nxv8f32 : ValueType<256, 102>; // n x 8 x f32 vector value
+def nxv16f32 : ValueType<512, 103>; // n x 16 x f32 vector value
+def nxv1f64 : ValueType<64, 104>; // n x 1 x f64 vector value
+def nxv2f64 : ValueType<128, 105>; // n x 2 x f64 vector value
+def nxv4f64 : ValueType<256, 106>; // n x 4 x f64 vector value
+def nxv8f64 : ValueType<512, 107>; // n x 8 x f64 vector value
+
+def x86mmx : ValueType<64 , 108>; // X86 MMX value
+def FlagVT : ValueType<0 , 109>; // Pre-RA sched glue
+def isVoid : ValueType<0 , 110>; // Produces no value
+def untyped: ValueType<8 , 111>; // Produces an untyped value
def token : ValueType<0 , 248>; // TokenTy
def MetadataVT: ValueType<0, 249>; // Metadata
diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h
index ac8aaafeadc1..71ea82b6a9ab 100644
--- a/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -32,6 +32,10 @@ public:
uint32_t length() const { return RecordData.size(); }
Kind kind() const { return Type; }
ArrayRef<uint8_t> data() const { return RecordData; }
+ StringRef str_data() const {
+ return StringRef(reinterpret_cast<const char *>(RecordData.data()),
+ RecordData.size());
+ }
ArrayRef<uint8_t> content() const {
return RecordData.drop_front(sizeof(RecordPrefix));
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeDumper.h b/include/llvm/DebugInfo/CodeView/CVTypeDumper.h
deleted file mode 100644
index 02f14ea2107b..000000000000
--- a/include/llvm/DebugInfo/CodeView/CVTypeDumper.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===-- CVTypeDumper.h - CodeView type info dumper --------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_CODEVIEW_CVTYPEDUMPER_H
-#define LLVM_DEBUGINFO_CODEVIEW_CVTYPEDUMPER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/Support/ScopedPrinter.h"
-
-namespace llvm {
-
-namespace codeview {
-
-class TypeServerHandler;
-
-/// Dumper for CodeView type streams found in COFF object files and PDB files.
-class CVTypeDumper {
-public:
- explicit CVTypeDumper(TypeDatabase &TypeDB,
- TypeServerHandler *Handler = nullptr)
- : TypeDB(TypeDB), Handler(Handler) {}
-
- /// Dumps one type record. Returns false if there was a type parsing error,
- /// and true otherwise. This should be called in order, since the dumper
- /// maintains state about previous records which are necessary for cross
- /// type references.
- Error dump(const CVType &Record, TypeVisitorCallbacks &Dumper);
-
- /// Dumps the type records in Types. Returns false if there was a type stream
- /// parse error, and true otherwise.
- Error dump(const CVTypeArray &Types, TypeVisitorCallbacks &Dumper);
-
- /// Dumps the type records in Data. Returns false if there was a type stream
- /// parse error, and true otherwise. Use this method instead of the
- /// CVTypeArray overload when type records are laid out contiguously in
- /// memory.
- Error dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper);
-
- static void printTypeIndex(ScopedPrinter &Printer, StringRef FieldName,
- TypeIndex TI, TypeDatabase &DB);
-
-private:
- TypeDatabase &TypeDB;
- TypeServerHandler *Handler;
-};
-
-} // end namespace codeview
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_CODEVIEW_TYPEDUMPER_H
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
index 6d9f345755ab..4bc8fbefd5d8 100644
--- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
@@ -10,42 +10,15 @@
#ifndef LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H
#define LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H
-#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/DebugInfo/CodeView/CVRecord.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/Support/Error.h"
namespace llvm {
namespace codeview {
-
-class CVTypeVisitor {
-public:
- explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks);
-
- void addTypeServerHandler(TypeServerHandler &Handler);
-
- Error visitTypeRecord(CVType &Record, TypeIndex Index);
- Error visitTypeRecord(CVType &Record);
- Error visitMemberRecord(CVMemberRecord Record);
-
- /// Visits the type records in Data. Sets the error flag on parse failures.
- Error visitTypeStream(const CVTypeArray &Types);
- Error visitTypeStream(CVTypeRange Types);
-
- Error visitFieldListMemberStream(ArrayRef<uint8_t> FieldList);
- Error visitFieldListMemberStream(BinaryStreamReader Reader);
-
-private:
- Expected<bool> handleTypeServer(CVType &Record);
- Error finishVisitation(CVType &Record);
-
- /// The interface to the class that gets notified of each visitation.
- TypeVisitorCallbacks &Callbacks;
-
- TinyPtrVector<TypeServerHandler *> Handlers;
-};
+class TypeCollection;
+class TypeServerHandler;
+class TypeVisitorCallbacks;
enum VisitorDataSource {
VDS_BytesPresent, // The record bytes are passed into the visitation
@@ -76,6 +49,8 @@ Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks,
TypeServerHandler *TS = nullptr);
Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks,
TypeServerHandler *TS = nullptr);
+Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS = nullptr);
} // end namespace codeview
} // end namespace llvm
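With CVTypeVisitor gone from the header, visitation is driven through the free functions above, including a new overload that walks a TypeCollection. A minimal sketch of driving a caller-defined callback over a collection; the counting callback here is purely illustrative:

    #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
    #include "llvm/DebugInfo/CodeView/TypeCollection.h"
    #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
    using namespace llvm;
    using namespace llvm::codeview;

    struct CountingCallbacks : TypeVisitorCallbacks {
      unsigned Count = 0;
      Error visitTypeBegin(CVType &Record) override {
        ++Count; // count every record we are shown
        return Error::success();
      }
    };

    Error countRecords(TypeCollection &Types, unsigned &Count) {
      CountingCallbacks CB;
      if (auto EC = visitTypeStream(Types, CB)) // new TypeCollection overload
        return EC;
      Count = CB.Count;
      return Error::success();
    }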
diff --git a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h b/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
index 21288df89be2..0d056e42b45f 100644
--- a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h
@@ -1,4 +1,4 @@
-//===- RandomAccessTypeVisitor.h ------------------------------ *- C++ --*-===//
+//===- LazyRandomTypeCollection.h ---------------------------- *- C++ --*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
-#define LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+#ifndef LLVM_DEBUGINFO_CODEVIEW_LAZYRANDOMTYPECOLLECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_LAZYRANDOMTYPECOLLECTION_H
-#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
@@ -21,7 +21,6 @@ namespace llvm {
namespace codeview {
class TypeDatabase;
-class TypeServerHandler;
class TypeVisitorCallbacks;
/// \brief Provides amortized O(1) random access to a CodeView type stream.
@@ -40,32 +39,48 @@ class TypeVisitorCallbacks;
/// consumer much better access time, because the consumer can find the nearest
/// index in this array, and do a linear scan forward only from there.
///
-/// RandomAccessTypeVisitor implements this algorithm, but additionally goes one
-/// step further by caching offsets of every record that has been visited at
+/// LazyRandomTypeCollection implements this algorithm, but additionally goes
+/// one step further by caching offsets of every record that has been visited at
/// least once. This way, even repeated visits of the same record will never
/// require more than one linear scan. For a type stream of N elements divided
/// into M chunks of roughly equal size, this yields a worst case lookup time
/// of O(N/M) and an amortized time of O(1).
-class RandomAccessTypeVisitor {
+class LazyRandomTypeCollection : public TypeCollection {
typedef FixedStreamArray<TypeIndexOffset> PartialOffsetArray;
public:
- RandomAccessTypeVisitor(const CVTypeArray &Types, uint32_t NumRecords,
- PartialOffsetArray PartialOffsets);
-
- Error visitTypeIndex(TypeIndex Index, TypeVisitorCallbacks &Callbacks);
+ explicit LazyRandomTypeCollection(uint32_t RecordCountHint);
+ LazyRandomTypeCollection(StringRef Data, uint32_t RecordCountHint);
+ LazyRandomTypeCollection(ArrayRef<uint8_t> Data, uint32_t RecordCountHint);
+ LazyRandomTypeCollection(const CVTypeArray &Types, uint32_t RecordCountHint,
+ PartialOffsetArray PartialOffsets);
+ LazyRandomTypeCollection(const CVTypeArray &Types, uint32_t RecordCountHint);
+
+ void reset(ArrayRef<uint8_t> Data);
+ void reset(StringRef Data);
+
+ CVType getType(TypeIndex Index) override;
+ StringRef getTypeName(TypeIndex Index) override;
+ bool contains(TypeIndex Index) override;
+ uint32_t size() override;
+ uint32_t capacity() override;
+ Optional<TypeIndex> getFirst() override;
+ Optional<TypeIndex> getNext(TypeIndex Prev) override;
+private:
const TypeDatabase &database() const { return Database; }
+ Error ensureTypeExists(TypeIndex Index);
-private:
Error visitRangeForType(TypeIndex TI);
+ Error fullScanForType(TypeIndex TI);
Error visitRange(TypeIndex Begin, uint32_t BeginOffset, TypeIndex End);
+ Error visitOneRecord(TypeIndex TI, uint32_t Offset, CVType &Record);
/// Visited records get automatically added to the type database.
TypeDatabase Database;
/// The type array to allow random access visitation of.
- const CVTypeArray &Types;
+ CVTypeArray Types;
/// The database visitor which adds new records to the database.
TypeDatabaseVisitor DatabaseVisitor;
@@ -85,4 +100,4 @@ private:
} // end namespace codeview
} // end namespace llvm
-#endif // LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+#endif // LLVM_DEBUGINFO_CODEVIEW_LAZYRANDOMTYPECOLLECTION_H
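LazyRandomTypeCollection keeps the same lazy partial-offset scan described in the comment above, but now exposes it through the TypeCollection interface. A hedged usage sketch, assuming the .debug$T record bytes are already in memory:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
    using namespace llvm;
    using namespace llvm::codeview;

    StringRef nameOfFirstUserType(ArrayRef<uint8_t> DebugTBytes) {
      // 100 is only a hint used to pre-size the offset cache.
      LazyRandomTypeCollection Types(DebugTBytes, /*RecordCountHint=*/100);
      // 0x1000 is the first non-simple (user-defined) type index.
      TypeIndex TI(0x1000);
      if (!Types.contains(TI))
        return "";
      return Types.getTypeName(TI);
    }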
diff --git a/include/llvm/DebugInfo/CodeView/SymbolDumper.h b/include/llvm/DebugInfo/CodeView/SymbolDumper.h
index a5419b37e776..e91065dcf87e 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDumper.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDumper.h
@@ -20,15 +20,15 @@ namespace llvm {
class ScopedPrinter;
namespace codeview {
-class TypeDatabase;
+class TypeCollection;
/// Dumper for CodeView symbol streams found in COFF object files and PDB files.
class CVSymbolDumper {
public:
- CVSymbolDumper(ScopedPrinter &W, TypeDatabase &TypeDB,
+ CVSymbolDumper(ScopedPrinter &W, TypeCollection &Types,
std::unique_ptr<SymbolDumpDelegate> ObjDelegate,
bool PrintRecordBytes)
- : W(W), TypeDB(TypeDB), ObjDelegate(std::move(ObjDelegate)),
+ : W(W), Types(Types), ObjDelegate(std::move(ObjDelegate)),
PrintRecordBytes(PrintRecordBytes) {}
/// Dumps one type record. Returns false if there was a type parsing error,
@@ -43,7 +43,7 @@ public:
private:
ScopedPrinter &W;
- TypeDatabase &TypeDB;
+ TypeCollection &Types;
std::unique_ptr<SymbolDumpDelegate> ObjDelegate;
bool PrintRecordBytes;
diff --git a/include/llvm/DebugInfo/CodeView/TypeCollection.h b/include/llvm/DebugInfo/CodeView/TypeCollection.h
new file mode 100644
index 000000000000..0f856f57a727
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/TypeCollection.h
@@ -0,0 +1,38 @@
+//===- TypeCollection.h - A collection of CodeView type records -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPECOLLECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_TYPECOLLECTION_H
+
+#include "llvm/ADT/StringRef.h"
+
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+
+namespace llvm {
+namespace codeview {
+class TypeCollection {
+public:
+ virtual ~TypeCollection() = default;
+
+ bool empty() { return size() == 0; }
+
+ virtual Optional<TypeIndex> getFirst() = 0;
+ virtual Optional<TypeIndex> getNext(TypeIndex Prev) = 0;
+
+ virtual CVType getType(TypeIndex Index) = 0;
+ virtual StringRef getTypeName(TypeIndex Index) = 0;
+ virtual bool contains(TypeIndex Index) = 0;
+ virtual uint32_t size() = 0;
+ virtual uint32_t capacity() = 0;
+};
+}
+}
+
+#endif
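TypeCollection is the new abstract interface; getFirst()/getNext() define a simple cursor-style forward iteration over whatever concrete collection backs it. A small sketch of walking every record, using nothing beyond the interface declared above:

    #include "llvm/ADT/Optional.h"
    #include "llvm/DebugInfo/CodeView/TypeCollection.h"
    using namespace llvm;
    using namespace llvm::codeview;

    void forEachRecord(TypeCollection &Types) {
      // Iterate with the Optional<TypeIndex> cursor protocol.
      for (Optional<TypeIndex> TI = Types.getFirst(); TI;
           TI = Types.getNext(*TI)) {
        CVType Record = Types.getType(*TI);
        StringRef Name = Types.getTypeName(*TI);
        (void)Record;
        (void)Name;
      }
    }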
diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
index 92c15ebd8b2b..a743e7f70855 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDatabase.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
@@ -13,6 +13,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/Support/Allocator.h"
@@ -20,7 +21,7 @@
namespace llvm {
namespace codeview {
-class TypeDatabase {
+class TypeDatabase : public TypeCollection {
friend class RandomAccessTypeVisitor;
public:
@@ -41,19 +42,31 @@ public:
CVType &getTypeRecord(TypeIndex Index);
bool contains(TypeIndex Index) const;
-
uint32_t size() const;
uint32_t capacity() const;
bool empty() const;
- TypeIndex getAppendIndex() const;
+ CVType getType(TypeIndex Index) override;
+ StringRef getTypeName(TypeIndex Index) override;
+ bool contains(TypeIndex Index) override;
+ uint32_t size() override;
+ uint32_t capacity() override;
+
+ Optional<TypeIndex> getFirst() override;
+ Optional<TypeIndex> getNext(TypeIndex Prev) override;
+
+ Optional<TypeIndex> largestTypeIndexLessThan(TypeIndex TI) const;
private:
+ TypeIndex getAppendIndex() const;
+
void grow();
+ void grow(TypeIndex Index);
BumpPtrAllocator Allocator;
uint32_t Count = 0;
+ TypeIndex LargestTypeIndex;
/// All user defined type records in .debug$T live in here. Type indices
/// greater than 0x1000 are user defined. Subtract 0x1000 from the index to
diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
index 6f10afb30d60..65b3a33e6548 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
@@ -12,7 +12,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSet.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
@@ -22,17 +21,20 @@ class ScopedPrinter;
namespace codeview {
+class TypeCollection;
+
/// Dumper for CodeView type streams found in COFF object files and PDB files.
class TypeDumpVisitor : public TypeVisitorCallbacks {
public:
- TypeDumpVisitor(TypeDatabase &TypeDB, ScopedPrinter *W, bool PrintRecordBytes)
- : W(W), PrintRecordBytes(PrintRecordBytes), TypeDB(TypeDB) {}
+ TypeDumpVisitor(TypeCollection &TpiTypes, ScopedPrinter *W,
+ bool PrintRecordBytes)
+ : W(W), PrintRecordBytes(PrintRecordBytes), TpiTypes(TpiTypes) {}
/// When dumping types from an IPI stream in a PDB, a type index may refer to
/// a type or an item ID. The dumper will lookup the "name" of the index in
/// the item database if appropriate. If ItemDB is null, it will use TypeDB,
/// which is correct when dumping types from an object file (/Z7).
- void setItemDB(TypeDatabase &DB) { ItemDB = &DB; }
+ void setIpiTypes(TypeCollection &Types) { IpiTypes = &Types; }
void printTypeIndex(StringRef FieldName, TypeIndex TI) const;
@@ -66,14 +68,16 @@ private:
/// Get the database of indices for the stream that we are dumping. If ItemDB
/// is set, then we must be dumping an item (IPI) stream. This will also
/// always get the appropriate DB for printing item names.
- TypeDatabase &getSourceDB() const { return ItemDB ? *ItemDB : TypeDB; }
+ TypeCollection &getSourceTypes() const {
+ return IpiTypes ? *IpiTypes : TpiTypes;
+ }
ScopedPrinter *W;
bool PrintRecordBytes = false;
- TypeDatabase &TypeDB;
- TypeDatabase *ItemDB = nullptr;
+ TypeCollection &TpiTypes;
+ TypeCollection *IpiTypes = nullptr;
};
} // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h
index b5d695fc49d5..31eed7d3e877 100644
--- a/include/llvm/DebugInfo/CodeView/TypeIndex.h
+++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h
@@ -15,8 +15,13 @@
#include <cinttypes>
namespace llvm {
+
+class ScopedPrinter;
+
namespace codeview {
+class TypeCollection;
+
enum class SimpleTypeKind : uint32_t {
None = 0x0000, // uncharacterized type (no type)
Void = 0x0003, // void
@@ -238,6 +243,11 @@ public:
return Result;
}
+ friend inline uint32_t operator-(const TypeIndex &A, const TypeIndex &B) {
+ assert(A >= B);
+ return A.toArrayIndex() - B.toArrayIndex();
+ }
+
private:
support::ulittle32_t Index;
};
@@ -249,6 +259,9 @@ struct TypeIndexOffset {
TypeIndex Type;
support::ulittle32_t Offset;
};
+
+void printTypeIndex(ScopedPrinter &Printer, StringRef FieldName, TypeIndex TI,
+ TypeCollection &Types);
}
}
diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h
index 1f4873c4f969..6dad98247136 100644
--- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h
+++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h
@@ -70,6 +70,8 @@ class TypeSerializer : public TypeVisitorCallbacks {
MutableArrayRef<uint8_t> getCurrentRecordData();
Error writeRecordPrefix(TypeLeafKind Kind);
TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
+ TypeIndex insertRecordBytesWithCopy(CVType &Record,
+ MutableArrayRef<uint8_t> Data);
Expected<MutableArrayRef<uint8_t>>
addPadding(MutableArrayRef<uint8_t> Record);
diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
index 2246f197e784..65bcf9812e68 100644
--- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
+++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
@@ -12,17 +12,20 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/Support/Error.h"
namespace llvm {
namespace codeview {
+class TypeIndex;
class TypeServerHandler;
+class TypeTableBuilder;
/// Merges one type stream into another. Returns true on success.
Error mergeTypeStreams(TypeTableBuilder &DestIdStream,
TypeTableBuilder &DestTypeStream,
+ SmallVectorImpl<TypeIndex> &SourceToDest,
TypeServerHandler *Handler, const CVTypeArray &Types);
} // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
new file mode 100644
index 000000000000..7de562a19a74
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
@@ -0,0 +1,42 @@
+//===- TypeTableCollection.h ---------------------------------- *- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPETABLECOLLECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_TYPETABLECOLLECTION_H
+
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+
+namespace llvm {
+namespace codeview {
+
+class TypeTableCollection : public TypeCollection {
+public:
+ explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
+
+ Optional<TypeIndex> getFirst() override;
+ Optional<TypeIndex> getNext(TypeIndex Prev) override;
+
+ CVType getType(TypeIndex Index) override;
+ StringRef getTypeName(TypeIndex Index) override;
+ bool contains(TypeIndex Index) override;
+ uint32_t size() override;
+ uint32_t capacity() override;
+
+private:
+ bool hasCapacityFor(TypeIndex Index) const;
+ void ensureTypeExists(TypeIndex Index);
+
+ ArrayRef<MutableArrayRef<uint8_t>> Records;
+ TypeDatabase Database;
+};
+}
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
index 2950c7d27cb6..0ea754deb425 100644
--- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
+++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
@@ -17,8 +17,6 @@ namespace llvm {
namespace codeview {
class TypeVisitorCallbacks {
- friend class CVTypeVisitor;
-
public:
virtual ~TypeVisitorCallbacks() = default;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
index a46d46a5bff3..46c0b7f4ce60 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -19,8 +19,9 @@ class DWARFCompileUnit : public DWARFUnit {
public:
DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, StringRef SOS, StringRef AOS, StringRef LS,
- bool LE, bool IsDWO, const DWARFUnitSectionBase &UnitSection,
+ StringRef SS, StringRef SOS, const DWARFSection *AOS,
+ StringRef LS, bool LE, bool IsDWO,
+ const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *Entry)
: DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
UnitSection, Entry) {}
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index ca82a68ead31..d3a63edf10ff 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -235,7 +235,7 @@ public:
virtual StringRef getStringDWOSection() = 0;
virtual StringRef getStringOffsetDWOSection() = 0;
virtual const DWARFSection &getRangeDWOSection() = 0;
- virtual StringRef getAddrSection() = 0;
+ virtual const DWARFSection &getAddrSection() = 0;
virtual const DWARFSection& getAppleNamesSection() = 0;
virtual const DWARFSection& getAppleTypesSection() = 0;
virtual const DWARFSection& getAppleNamespacesSection() = 0;
@@ -290,7 +290,7 @@ class DWARFContextInMemory : public DWARFContext {
StringRef StringDWOSection;
StringRef StringOffsetDWOSection;
DWARFSection RangeDWOSection;
- StringRef AddrSection;
+ DWARFSection AddrSection;
DWARFSection AppleNamesSection;
DWARFSection AppleTypesSection;
DWARFSection AppleNamespacesSection;
@@ -356,9 +356,7 @@ public:
const DWARFSection &getRangeDWOSection() override { return RangeDWOSection; }
- StringRef getAddrSection() override {
- return AddrSection;
- }
+ const DWARFSection &getAddrSection() override { return AddrSection; }
StringRef getCUIndexSection() override { return CUIndexSection; }
StringRef getGdbIndexSection() override { return GdbIndexSection; }
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index ec0397a0fb09..fabacc0abcea 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -17,7 +17,7 @@
namespace llvm {
struct RelocAddrEntry {
- int64_t Value;
+ uint64_t Value;
};
/// In place of applying the relocations to the data we've read from disk we use
diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
index c9da2c9a3e16..c77d946c070a 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -31,8 +31,9 @@ private:
public:
DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, StringRef SOS, StringRef AOS, StringRef LS,
- bool LE, bool IsDWO, const DWARFUnitSectionBase &UnitSection,
+ StringRef SS, StringRef SOS, const DWARFSection *AOS,
+ StringRef LS, bool LE, bool IsDWO,
+ const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *Entry)
: DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
UnitSection, Entry) {}
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index c15e27f36a8b..ae7fd24ce5bb 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -57,7 +57,7 @@ protected:
virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, StringRef SOS, StringRef AOS,
+ StringRef SS, StringRef SOS, const DWARFSection *AOS,
StringRef LS, bool isLittleEndian, bool isDWO) = 0;
};
@@ -89,8 +89,8 @@ public:
private:
void parseImpl(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, StringRef SOS, StringRef AOS, StringRef LS,
- bool LE, bool IsDWO) override {
+ StringRef SS, StringRef SOS, const DWARFSection *AOS,
+ StringRef LS, bool LE, bool IsDWO) override {
if (Parsed)
return;
const auto &Index = getDWARFUnitIndex(Context, UnitType::Section);
@@ -120,7 +120,7 @@ class DWARFUnit {
StringRef LineSection;
StringRef StringSection;
StringRef StringOffsetSection;
- StringRef AddrOffsetSection;
+ const DWARFSection *AddrOffsetSection;
uint32_t AddrOffsetSectionBase;
bool isLittleEndian;
bool isDWO;
@@ -149,7 +149,7 @@ class DWARFUnit {
DWARFUnit *DWOU = nullptr;
public:
- DWOHolder(StringRef DWOPath);
+ DWOHolder(StringRef DWOPath, uint64_t DWOId);
DWARFUnit *getUnit() const { return DWOU; }
};
@@ -172,8 +172,8 @@ protected:
public:
DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
- StringRef SOS, StringRef AOS, StringRef LS, bool LE, bool IsDWO,
- const DWARFUnitSectionBase &UnitSection,
+ StringRef SOS, const DWARFSection *AOS, StringRef LS, bool LE,
+ bool IsDWO, const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *IndexEntry = nullptr);
virtual ~DWARFUnit();
@@ -184,7 +184,7 @@ public:
StringRef getStringSection() const { return StringSection; }
StringRef getStringOffsetSection() const { return StringOffsetSection; }
- void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
+ void setAddrOffsetSection(const DWARFSection *AOS, uint32_t Base) {
AddrOffsetSection = AOS;
AddrOffsetSectionBase = Base;
}
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h
index d965e1008e95..bfd38b6c80ec 100644
--- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h
+++ b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h
@@ -13,7 +13,6 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
index 6c609c34665c..21cfa83e6af4 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
@@ -72,7 +72,7 @@ private:
size_t TypeRecordBytes = 0;
- Optional<PdbRaw_TpiVer> VerHeader;
+ PdbRaw_TpiVer VerHeader = PdbRaw_TpiVer::PdbTpiV80;
std::vector<ArrayRef<uint8_t>> TypeRecords;
std::vector<uint32_t> TypeHashes;
std::vector<codeview::TypeIndexOffset> TypeIndexOffsets;
diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h
index 5ed6d030c984..3efcc637b6ed 100644
--- a/include/llvm/IR/Argument.h
+++ b/include/llvm/IR/Argument.h
@@ -27,8 +27,7 @@ namespace llvm {
/// for a specific function. When used in the body of said function, the
/// argument of course represents the value of the actual argument that the
/// function was called with.
-class Argument : public Value {
- virtual void anchor();
+class Argument final : public Value {
Function *Parent;
unsigned ArgNo;
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 97989cf5c652..c917b1f2cada 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -51,8 +51,8 @@ class ValueSymbolTable;
/// occur because it may be useful in the intermediate stage of constructing or
/// modifying a program. However, the verifier will ensure that basic blocks
/// are "well formed".
-class BasicBlock : public Value, // Basic blocks are data objects also
- public ilist_node_with_parent<BasicBlock, Function> {
+class BasicBlock final : public Value, // Basic blocks are data objects also
+ public ilist_node_with_parent<BasicBlock, Function> {
public:
using InstListType = SymbolTableList<Instruction>;
@@ -77,7 +77,7 @@ private:
public:
BasicBlock(const BasicBlock &) = delete;
BasicBlock &operator=(const BasicBlock &) = delete;
- ~BasicBlock() override;
+ ~BasicBlock();
/// \brief Get the context in which this basic block lives.
LLVMContext &getContext() const;
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index 3b3694e7e60d..82afd9a2691f 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -40,8 +40,6 @@ class APInt;
/// don't have to worry about the lifetime of the objects.
/// @brief LLVM Constant Representation
class Constant : public User {
- void anchor() override;
-
protected:
Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps)
: User(ty, vty, Ops, NumOps) {}
diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index 5db9b3bb5048..40a8d1eb27d0 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h
@@ -58,8 +58,6 @@ template <class ConstantClass> struct ConstantAggrKeyType;
class ConstantData : public Constant {
friend class Constant;
- void anchor() override;
-
Value *handleOperandChangeImpl(Value *From, Value *To) {
llvm_unreachable("Constant data does not have operands!");
}
@@ -93,7 +91,6 @@ class ConstantInt final : public ConstantData {
ConstantInt(IntegerType *Ty, const APInt& V);
- void anchor() override;
void destroyConstantImpl();
public:
@@ -274,7 +271,6 @@ class ConstantFP final : public ConstantData {
ConstantFP(Type *Ty, const APFloat& V);
- void anchor() override;
void destroyConstantImpl();
public:
@@ -588,7 +584,7 @@ class ConstantDataSequential : public ConstantData {
protected:
explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data)
: ConstantData(ty, VT), DataElements(Data), Next(nullptr) {}
- ~ConstantDataSequential() override { delete Next; }
+ ~ConstantDataSequential() { delete Next; }
static Constant *getImpl(StringRef Bytes, Type *Ty);
@@ -638,8 +634,8 @@ public:
/// The size of the elements is known to be a multiple of one byte.
uint64_t getElementByteSize() const;
- /// This method returns true if this is an array of i8.
- bool isString() const;
+ /// This method returns true if this is an array of \p CharSize integers.
+ bool isString(unsigned CharSize = 8) const;
/// This method returns true if the array "isString", ends with a null byte,
/// and does not contain any other null bytes.
@@ -692,8 +688,6 @@ class ConstantDataArray final : public ConstantDataSequential {
return User::operator new(s, 0);
}
- void anchor() override;
-
public:
ConstantDataArray(const ConstantDataArray &) = delete;
@@ -755,8 +749,6 @@ class ConstantDataVector final : public ConstantDataSequential {
return User::operator new(s, 0);
}
- void anchor() override;
-
public:
ConstantDataVector(const ConstantDataVector &) = delete;
diff --git a/include/llvm/IR/DerivedUser.h b/include/llvm/IR/DerivedUser.h
new file mode 100644
index 000000000000..4d681e0db611
--- /dev/null
+++ b/include/llvm/IR/DerivedUser.h
@@ -0,0 +1,41 @@
+//===-- DerivedUser.h - Base for non-IR Users -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_DERIVEDUSER_H
+#define LLVM_IR_DERIVEDUSER_H
+
+#include "llvm/IR/User.h"
+
+namespace llvm {
+
+/// Extension point for the Value hierarchy. All classes outside of lib/IR
+/// that wish to inherit from User should inherit from DerivedUser instead.
+/// Even so, inheriting from this class is discouraged.
+///
+/// Generally speaking, Value is the base of a closed class hierarchy
+/// that can't be extended by code outside of lib/IR. This class creates a
+/// loophole that allows classes outside of lib/IR to extend User to leverage
+/// its use/def list machinery.
+class DerivedUser : public User {
+protected:
+ typedef void (*DeleteValueTy)(DerivedUser *);
+
+private:
+ friend Value;
+ DeleteValueTy DeleteValue;
+
+public:
+ DerivedUser(Type *Ty, unsigned VK, Use *U, unsigned NumOps,
+ DeleteValueTy DeleteValue)
+ : User(Ty, VK, U, NumOps), DeleteValue(DeleteValue) {}
+};
+
+} // namespace llvm
+
+#endif // LLVM_IR_DERIVEDUSER_H
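DerivedUser's constructor mirrors User's but also takes a deleter callback, which Value invokes in place of a virtual destructor. A hedged sketch of an out-of-tree subclass; the class and its deleter are purely illustrative:

    #include "llvm/IR/DerivedUser.h"
    using namespace llvm;

    // Hypothetical analysis node that wants Value's use/def list machinery.
    class MyAccess : public DerivedUser {
      // Called through Value instead of a virtual destructor.
      static void deleteMe(DerivedUser *Self) {
        delete static_cast<MyAccess *>(Self);
      }

    public:
      MyAccess(Type *Ty, unsigned ValueKind)
          : DerivedUser(Ty, ValueKind, /*U=*/nullptr, /*NumOps=*/0, deleteMe) {}
    };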
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 8a2a6ed87eb2..f27e5c50a47f 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -123,7 +123,7 @@ private:
public:
Function(const Function&) = delete;
void operator=(const Function&) = delete;
- ~Function() override;
+ ~Function();
static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
const Twine &N = "", Module *M = nullptr) {
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index 0793a1c0ee2e..20495725f9d0 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -161,6 +161,10 @@ protected:
Parent = parent;
}
+ ~GlobalValue() {
+ removeDeadConstantUsers(); // remove any dead constants using this.
+ }
+
public:
enum ThreadLocalMode {
NotThreadLocal = 0,
@@ -172,10 +176,6 @@ public:
GlobalValue(const GlobalValue &) = delete;
- ~GlobalValue() override {
- removeDeadConstantUsers(); // remove any dead constants using this.
- }
-
unsigned getAlignment() const;
enum class UnnamedAddr {
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index 21d334c8f01d..3f5d00bd3b3a 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -66,7 +66,7 @@ public:
GlobalVariable(const GlobalVariable &) = delete;
GlobalVariable &operator=(const GlobalVariable &) = delete;
- ~GlobalVariable() override {
+ ~GlobalVariable() {
dropAllReferences();
// FIXME: needed by operator delete
diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index a57e7d63012b..7f03fcd19b65 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h
@@ -28,7 +28,7 @@ class FunctionType;
class PointerType;
template <class ConstantClass> class ConstantUniqueMap;
-class InlineAsm : public Value {
+class InlineAsm final : public Value {
public:
enum AsmDialect {
AD_ATT,
@@ -48,7 +48,6 @@ private:
InlineAsm(FunctionType *Ty, const std::string &AsmString,
const std::string &Constraints, bool hasSideEffects,
bool isAlignStack, AsmDialect asmDialect);
- ~InlineAsm() override;
/// When the ConstantUniqueMap merges two types and makes two InlineAsms
/// identical, it destroys one of them with this method.
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index 61ca90de7393..e850c015d711 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -62,9 +62,6 @@ protected:
Use *Ops, unsigned NumOps, BasicBlock *InsertAtEnd)
: Instruction(Ty, iType, Ops, NumOps, InsertAtEnd) {}
- // Out of line virtual method, so the vtable, etc has a home.
- ~TerminatorInst() override;
-
public:
/// Return the number of successors that this terminator has.
unsigned getNumSuccessors() const;
@@ -299,9 +296,6 @@ public:
void *operator new(size_t, unsigned) = delete;
- // Out of line virtual method, so the vtable, etc has a home.
- ~UnaryInstruction() override;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -568,8 +562,6 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryOperator, Value)
/// if (isa<CastInst>(Instr)) { ... }
/// @brief Base class of casting instructions.
class CastInst : public UnaryInstruction {
- void anchor() override;
-
protected:
/// @brief Constructor with insert-before-instruction semantics for subclasses
CastInst(Type *Ty, unsigned iType, Value *S,
@@ -914,8 +906,6 @@ protected:
Value *LHS, Value *RHS, const Twine &Name,
BasicBlock *InsertAtEnd);
- void anchor() override; // Out of line virtual method.
-
public:
CmpInst() = delete;
diff --git a/include/llvm/IR/Instruction.def b/include/llvm/IR/Instruction.def
index 18711abb8060..86617299c44a 100644
--- a/include/llvm/IR/Instruction.def
+++ b/include/llvm/IR/Instruction.def
@@ -102,6 +102,10 @@
#define LAST_OTHER_INST(num)
#endif
+#ifndef HANDLE_USER_INST
+#define HANDLE_USER_INST(num, opc, Class) HANDLE_OTHER_INST(num, opc, Class)
+#endif
+
// Terminator Instructions - These instructions are used to terminate a basic
// block of the program. Every basic block must end with one of these
// instructions for it to be a well formed basic block.
@@ -185,8 +189,8 @@ HANDLE_OTHER_INST(52, FCmp , FCmpInst ) // Floating point comparison instr.
HANDLE_OTHER_INST(53, PHI , PHINode ) // PHI node instruction
HANDLE_OTHER_INST(54, Call , CallInst ) // Call a function
HANDLE_OTHER_INST(55, Select , SelectInst ) // select instruction
-HANDLE_OTHER_INST(56, UserOp1, Instruction) // May be used internally in a pass
-HANDLE_OTHER_INST(57, UserOp2, Instruction) // Internal to passes only
+HANDLE_USER_INST (56, UserOp1, Instruction) // May be used internally in a pass
+HANDLE_USER_INST (57, UserOp2, Instruction) // Internal to passes only
HANDLE_OTHER_INST(58, VAArg , VAArgInst ) // vaarg instruction
HANDLE_OTHER_INST(59, ExtractElement, ExtractElementInst)// extract from vector
HANDLE_OTHER_INST(60, InsertElement, InsertElementInst) // insert into vector
@@ -220,6 +224,8 @@ HANDLE_OTHER_INST(64, LandingPad, LandingPadInst) // Landing pad instruction.
#undef HANDLE_OTHER_INST
#undef LAST_OTHER_INST
+#undef HANDLE_USER_INST
+
#ifdef HANDLE_INST
#undef HANDLE_INST
#endif
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index fca29900f4c2..6e109735ddd3 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -36,6 +36,10 @@ class FastMathFlags;
class MDNode;
struct AAMDNodes;
+template <> struct ilist_alloc_traits<Instruction> {
+ static inline void deleteNode(Instruction *V);
+};
+
class Instruction : public User,
public ilist_node_with_parent<Instruction, BasicBlock> {
BasicBlock *Parent;
@@ -47,13 +51,13 @@ class Instruction : public User,
HasMetadataBit = 1 << 15
};
+protected:
+ ~Instruction(); // Use deleteValue() to delete a generic Instruction.
+
public:
Instruction(const Instruction &) = delete;
Instruction &operator=(const Instruction &) = delete;
- // Out of line virtual method, so the vtable, etc has a home.
- ~Instruction() override;
-
/// Specialize the methods defined in Value, as we know that an instruction
/// can only be used by other instructions.
Instruction *user_back() { return cast<Instruction>(*user_begin());}
@@ -640,6 +644,10 @@ private:
Instruction *cloneImpl() const;
};
+inline void ilist_alloc_traits<Instruction>::deleteNode(Instruction *V) {
+ V->deleteValue();
+}
+
} // end namespace llvm
#endif // LLVM_IR_INSTRUCTION_H
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index c26701af27ce..6fab59613dd6 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -89,9 +89,6 @@ public:
AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, unsigned Align,
const Twine &Name, BasicBlock *InsertAtEnd);
- // Out of line virtual method, so the vtable, etc. has a home.
- ~AllocaInst() override;
-
/// Return true if there is an allocation size parameter to the allocation
/// instruction that is not 1.
bool isArrayAllocation() const;
@@ -856,7 +853,6 @@ class GetElementPtrInst : public Instruction {
ArrayRef<Value *> IdxList, unsigned Values,
const Twine &NameStr, BasicBlock *InsertAtEnd);
- void anchor() override;
void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr);
protected:
@@ -1112,8 +1108,6 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value)
/// must be identical types.
/// Represent an integer comparison operator.
class ICmpInst: public CmpInst {
- void anchor() override;
-
void AssertOK() {
assert(getPredicate() >= CmpInst::FIRST_ICMP_PREDICATE &&
getPredicate() <= CmpInst::LAST_ICMP_PREDICATE &&
@@ -1426,8 +1420,6 @@ protected:
CallInst *cloneImpl() const;
public:
- ~CallInst() override;
-
static CallInst *Create(Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles = None,
const Twine &NameStr = "",
@@ -2592,8 +2584,6 @@ class PHINode : public Instruction {
return User::operator new(s);
}
- void anchor() override;
-
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -2927,8 +2917,6 @@ protected:
ReturnInst *cloneImpl() const;
public:
- ~ReturnInst() override;
-
static ReturnInst* Create(LLVMContext &C, Value *retVal = nullptr,
Instruction *InsertBefore = nullptr) {
return new(!!retVal) ReturnInst(C, retVal, InsertBefore);
diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index 8f24a6a1d69d..92f701e01ff3 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@@ -174,12 +174,13 @@ class MetadataAsValue : public Value {
Metadata *MD;
MetadataAsValue(Type *Ty, Metadata *MD);
- ~MetadataAsValue() override;
/// \brief Drop use of metadata (during teardown).
void dropUse() { MD = nullptr; }
public:
+ ~MetadataAsValue();
+
static MetadataAsValue *get(LLVMContext &Context, Metadata *MD);
static MetadataAsValue *getIfExists(LLVMContext &Context, Metadata *MD);
Metadata *getMetadata() const { return MD; }
diff --git a/include/llvm/IR/OperandTraits.h b/include/llvm/IR/OperandTraits.h
index e97a8009ccc0..7b94283856b6 100644
--- a/include/llvm/IR/OperandTraits.h
+++ b/include/llvm/IR/OperandTraits.h
@@ -30,6 +30,9 @@ namespace llvm {
template <typename SubClass, unsigned ARITY>
struct FixedNumOperandTraits {
static Use *op_begin(SubClass* U) {
+ static_assert(
+ !std::is_polymorphic<SubClass>::value,
+ "adding virtual methods to subclasses of User breaks use lists");
return reinterpret_cast<Use*>(U) - ARITY;
}
static Use *op_end(SubClass* U) {
@@ -65,6 +68,9 @@ struct OptionalOperandTraits : public FixedNumOperandTraits<SubClass, ARITY> {
template <typename SubClass, unsigned MINARITY = 0>
struct VariadicOperandTraits {
static Use *op_begin(SubClass* U) {
+ static_assert(
+ !std::is_polymorphic<SubClass>::value,
+ "adding virtual methods to subclasses of User breaks use lists");
return reinterpret_cast<Use*>(U) - static_cast<User*>(U)->getNumOperands();
}
static Use *op_end(SubClass* U) {
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 997a85340c25..49fa6a6a877a 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -29,16 +29,11 @@ namespace llvm {
/// This is a utility class that provides an abstraction for the common
/// functionality between Instructions and ConstantExprs.
class Operator : public User {
-protected:
- // NOTE: Cannot use = delete because it's not legal to delete
- // an overridden method that's not deleted in the base class. Cannot leave
- // this unimplemented because that leads to an ODR-violation.
- ~Operator() override;
-
public:
// The Operator class is intended to be used as a utility, and is never itself
// instantiated.
Operator() = delete;
+ ~Operator() = delete;
void *operator new(size_t, unsigned) = delete;
void *operator new(size_t s) = delete;
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 6b2b22e82b95..072c6c5ece83 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -886,17 +886,21 @@ template <typename LHS_t> struct not_match {
template <typename OpTy> bool match(OpTy *V) {
if (auto *O = dyn_cast<Operator>(V))
- if (O->getOpcode() == Instruction::Xor)
- return matchIfNot(O->getOperand(0), O->getOperand(1));
+ if (O->getOpcode() == Instruction::Xor) {
+ if (isAllOnes(O->getOperand(1)))
+ return L.match(O->getOperand(0));
+ if (isAllOnes(O->getOperand(0)))
+ return L.match(O->getOperand(1));
+ }
return false;
}
private:
- bool matchIfNot(Value *LHS, Value *RHS) {
- return (isa<ConstantInt>(RHS) || isa<ConstantDataVector>(RHS) ||
+ bool isAllOnes(Value *V) {
+ return (isa<ConstantInt>(V) || isa<ConstantDataVector>(V) ||
// FIXME: Remove CV.
- isa<ConstantVector>(RHS)) &&
- cast<Constant>(RHS)->isAllOnesValue() && L.match(LHS);
+ isa<ConstantVector>(V)) &&
+ cast<Constant>(V)->isAllOnesValue();
}
};
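
Illustrative only, not part of this patch: with the symmetric check above, m_Not recognizes the all-ones constant in either xor operand position. A minimal sketch:

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// True for both `xor %x, -1` and `xor -1, %x`.
static bool isBitwiseNot(Value *V) {
  Value *X;
  return match(V, m_Not(m_Value(X)));
}
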
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index 7b9d451aaf53..109a3d5e7be8 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -46,8 +46,6 @@ class User : public Value {
template <unsigned>
friend struct HungoffOperandTraits;
- virtual void anchor();
-
LLVM_ATTRIBUTE_ALWAYS_INLINE inline static void *
allocateFixedOperandUser(size_t, unsigned, unsigned);
@@ -93,9 +91,11 @@ protected:
/// should be called if there are no uses.
void growHungoffUses(unsigned N, bool IsPhi = false);
+protected:
+ ~User() = default; // Use deleteValue() to delete a generic Instruction.
+
public:
User(const User &) = delete;
- ~User() override = default;
/// \brief Free memory allocated for User and Use objects.
void operator delete(void *Usr);
diff --git a/include/llvm/IR/Value.def b/include/llvm/IR/Value.def
index 48842d7f9cd5..cebd7f7297ef 100644
--- a/include/llvm/IR/Value.def
+++ b/include/llvm/IR/Value.def
@@ -20,10 +20,14 @@
#if !(defined HANDLE_GLOBAL_VALUE || defined HANDLE_CONSTANT || \
defined HANDLE_INSTRUCTION || defined HANDLE_INLINE_ASM_VALUE || \
defined HANDLE_METADATA_VALUE || defined HANDLE_VALUE || \
- defined HANDLE_CONSTANT_MARKER)
+ defined HANDLE_CONSTANT_MARKER || defined HANDLE_MEMORY_VALUE)
#error "Missing macro definition of HANDLE_VALUE*"
#endif
+#ifndef HANDLE_MEMORY_VALUE
+#define HANDLE_MEMORY_VALUE(ValueName) HANDLE_VALUE(ValueName)
+#endif
+
#ifndef HANDLE_GLOBAL_VALUE
#define HANDLE_GLOBAL_VALUE(ValueName) HANDLE_CONSTANT(ValueName)
#endif
@@ -54,9 +58,13 @@
HANDLE_VALUE(Argument)
HANDLE_VALUE(BasicBlock)
-HANDLE_VALUE(MemoryUse)
-HANDLE_VALUE(MemoryDef)
-HANDLE_VALUE(MemoryPhi)
+
+// FIXME: It's awkward that Value.def knows about classes in Analysis. While
+// this doesn't introduce a strict link or include dependency, we should remove
+// the circular dependency eventually.
+HANDLE_MEMORY_VALUE(MemoryUse)
+HANDLE_MEMORY_VALUE(MemoryDef)
+HANDLE_MEMORY_VALUE(MemoryPhi)
HANDLE_GLOBAL_VALUE(Function)
HANDLE_GLOBAL_VALUE(GlobalAlias)
@@ -94,6 +102,7 @@ HANDLE_CONSTANT_MARKER(ConstantDataLastVal, ConstantTokenNone)
HANDLE_CONSTANT_MARKER(ConstantAggregateFirstVal, ConstantArray)
HANDLE_CONSTANT_MARKER(ConstantAggregateLastVal, ConstantVector)
+#undef HANDLE_MEMORY_VALUE
#undef HANDLE_GLOBAL_VALUE
#undef HANDLE_CONSTANT
#undef HANDLE_INSTRUCTION
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 96a370dcc35f..d669b1544070 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -21,6 +21,7 @@
#include "llvm-c/Types.h"
#include <cassert>
#include <iterator>
+#include <memory>
namespace llvm {
@@ -69,6 +70,8 @@ using ValueName = StringMapEntry<Value*>;
/// objects that watch it and listen to RAUW and Destroy events. See
/// llvm/IR/ValueHandle.h for details.
class Value {
+ // The least-significant bit of the first word of Value *must* be zero:
+ // http://www.llvm.org/docs/ProgrammersManual.html#the-waymarking-algorithm
Type *VTy;
Use *UseList;
@@ -200,10 +203,19 @@ private:
protected:
Value(Type *Ty, unsigned scid);
+ /// Value's destructor should be virtual by design, but that would require
+ /// that Value and all of its subclasses have a vtable that effectively
+ /// duplicates the information in the value ID. As a size optimization, the
+ /// destructor has been protected, and the caller should manually call
+ /// deleteValue.
+ ~Value(); // Use deleteValue() to delete a generic Value.
+
public:
Value(const Value &) = delete;
void operator=(const Value &) = delete;
- virtual ~Value();
+
+ /// Delete a pointer to a generic Value.
+ void deleteValue();
/// \brief Support for debugging, callable in GDB: V->dump()
void dump() const;
@@ -643,6 +655,13 @@ protected:
void setValueSubclassData(unsigned short D) { SubclassData = D; }
};
+struct ValueDeleter { void operator()(Value *V) { V->deleteValue(); } };
+
+/// Use this instead of std::unique_ptr<Value> or std::unique_ptr<Instruction>.
+/// Those don't work because Value and Instruction's destructors are protected,
+/// aren't virtual, and won't destroy the complete object.
+typedef std::unique_ptr<Value, ValueDeleter> unique_value;
+
inline raw_ostream &operator<<(raw_ostream &OS, const Value &V) {
V.print(OS);
return OS;
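
A minimal usage sketch, not part of the patch: with ~Value() protected and non-virtual, owned IR objects are destroyed through deleteValue(), most conveniently via the unique_value typedef above. The helper function name is invented for illustration.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static void ownTemporaryInstruction(LLVMContext &Ctx) {
  // A plain `delete` on the raw pointer would no longer compile;
  // ValueDeleter calls deleteValue() instead.
  unique_value R(ReturnInst::Create(Ctx));
  // ... inspect cast<ReturnInst>(R.get()) here ...
}   // R is destroyed via deleteValue() when it goes out of scope.
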
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index cf314e19d1ca..3df5244a0bd6 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -187,7 +187,6 @@ void initializeLintPass(PassRegistry&);
void initializeLiveDebugValuesPass(PassRegistry&);
void initializeLiveDebugVariablesPass(PassRegistry&);
void initializeLiveIntervalsPass(PassRegistry&);
-void initializeLiveRangeShrinkPass(PassRegistry&);
void initializeLiveRegMatrixPass(PassRegistry&);
void initializeLiveStacksPass(PassRegistry&);
void initializeLiveVariablesPass(PassRegistry&);
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index f42048e48ee3..cf5d93ee9ed7 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -57,6 +57,8 @@ protected:
ID_MachO64L, // MachO 64-bit, little endian
ID_MachO64B, // MachO 64-bit, big endian
+ ID_WinRes, // Windows resource (.res) file.
+
ID_Wasm,
ID_EndObjects
@@ -132,6 +134,8 @@ public:
TypeID == ID_MachO32B || TypeID == ID_MachO64B);
}
+ bool isWinRes() const { return TypeID == ID_WinRes; }
+
Triple::ObjectFormatType getTripleObjectFormat() const {
if (isCOFF())
return Triple::COFF;
diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h
index 78d9d679acd3..78044a2832fa 100644
--- a/include/llvm/Object/COFFImportFile.h
+++ b/include/llvm/Object/COFFImportFile.h
@@ -9,13 +9,15 @@
//
// COFF short import file is a special kind of file which contains
// only symbol names for DLL-exported symbols. This class implements
-// SymbolicFile interface for the file.
+// exporting of Symbols to create libraries and a SymbolicFile
+// interface for the file type.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_OBJECT_COFF_IMPORT_FILE_H
#define LLVM_OBJECT_COFF_IMPORT_FILE_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ObjectFile.h"
@@ -68,6 +70,36 @@ private:
}
};
+struct COFFShortExport {
+ std::string Name;
+ std::string ExtName;
+
+ uint16_t Ordinal = 0;
+ bool Noname = false;
+ bool Data = false;
+ bool Private = false;
+ bool Constant = false;
+
+ bool isWeak() {
+ return ExtName.size() && ExtName != Name;
+ }
+
+ friend bool operator==(const COFFShortExport &L, const COFFShortExport &R) {
+ return L.Name == R.Name && L.ExtName == R.ExtName &&
+ L.Ordinal == R.Ordinal && L.Noname == R.Noname &&
+ L.Data == R.Data && L.Private == R.Private;
+ }
+
+ friend bool operator!=(const COFFShortExport &L, const COFFShortExport &R) {
+ return !(L == R);
+ }
+};
+
+std::error_code writeImportLibrary(StringRef DLLName,
+ StringRef Path,
+ ArrayRef<COFFShortExport> Exports,
+ COFF::MachineTypes Machine);
+
} // namespace object
} // namespace llvm
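
A hedged usage sketch of the new API; the DLL name, library path, and symbol below are invented for illustration.

#include "llvm/Object/COFFImportFile.h"
#include <vector>
using namespace llvm;
using namespace llvm::object;

static std::error_code writeExampleImportLib() {
  COFFShortExport Exp;
  Exp.Name = "my_function";                     // exported symbol
  std::vector<COFFShortExport> Exports = {Exp};
  return writeImportLibrary("example.dll", "example.lib", Exports,
                            COFF::IMAGE_FILE_MACHINE_AMD64);
}
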
diff --git a/include/llvm/Object/COFFModuleDefinition.h b/include/llvm/Object/COFFModuleDefinition.h
new file mode 100644
index 000000000000..0428283fdc88
--- /dev/null
+++ b/include/llvm/Object/COFFModuleDefinition.h
@@ -0,0 +1,49 @@
+//===--- COFFModuleDefinition.h ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Windows-specific.
+// A parser for the module-definition file (.def file).
+// Parsed results are directly written to the Config global variable.
+//
+// The format of module-definition files is described in this document:
+// https://msdn.microsoft.com/en-us/library/28d6s79h.aspx
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_OBJECT_COFF_MODULE_DEFINITION_H
+#define LLVM_OBJECT_COFF_MODULE_DEFINITION_H
+
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Object/COFF.h"
+
+namespace llvm {
+namespace object {
+
+struct COFFModuleDefinition {
+ std::vector<COFFShortExport> Exports;
+ std::string OutputFile;
+ uint64_t ImageBase = 0;
+ uint64_t StackReserve = 0;
+ uint64_t StackCommit = 0;
+ uint64_t HeapReserve = 0;
+ uint64_t HeapCommit = 0;
+ uint32_t MajorImageVersion = 0;
+ uint32_t MinorImageVersion = 0;
+ uint32_t MajorOSVersion = 0;
+ uint32_t MinorOSVersion = 0;
+};
+
+Expected<COFFModuleDefinition>
+parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine);
+
+} // End namespace object.
+} // End namespace llvm.
+
+#endif
diff --git a/include/llvm/Object/Decompressor.h b/include/llvm/Object/Decompressor.h
index a11857d546aa..0f63f8b821b7 100644
--- a/include/llvm/Object/Decompressor.h
+++ b/include/llvm/Object/Decompressor.h
@@ -30,7 +30,10 @@ public:
/// @brief Resize the buffer and uncompress section data into it.
/// @param Out Destination buffer.
- Error decompress(SmallString<32> &Out);
+ template <class T> Error resizeAndDecompress(T &Out) {
+ Out.resize(DecompressedSize);
+ return decompress({Out.data(), (size_t)DecompressedSize});
+ }
/// @brief Uncompress section data to raw buffer provided.
/// @param Buffer Destination buffer.
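
Illustrative only: the new template works with any buffer type that provides resize() and data(), such as SmallString or std::vector<uint8_t>. A minimal sketch:

#include "llvm/ADT/SmallString.h"
#include "llvm/Object/Decompressor.h"
using namespace llvm;
using namespace llvm::object;

// Decompress a section into a SmallString-backed buffer.
static Error decompressSection(Decompressor &D, SmallString<32> &Out) {
  return D.resizeAndDecompress(Out);
}
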
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 42fdfe3e5a74..a4d431b6cbe7 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -235,10 +235,7 @@ ELFFile<ELFT>::getSection(const Elf_Sym *Sym, Elf_Sym_Range Symbols,
uint32_t Index = *IndexOrErr;
if (Index == 0)
return nullptr;
- auto SectionsOrErr = sections();
- if (!SectionsOrErr)
- return SectionsOrErr.takeError();
- return object::getSection<ELFT>(*SectionsOrErr, Index);
+ return getSection(Index);
}
template <class ELFT>
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index 73c7ce367cb0..86579b7c3e3a 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -32,18 +32,6 @@
namespace llvm {
namespace object {
-struct RelocToApply {
- // The computed value after applying the relevant relocations.
- int64_t Value = 0;
-
- // The width of the value; how many bytes to touch when applying the
- // relocation.
- char Width = 0;
-
- RelocToApply() = default;
- RelocToApply(int64_t Value, char Width) : Value(Value), Width(Width) {}
-};
-
/// @brief Base class for object file relocation visitors.
class RelocVisitor {
public:
@@ -52,7 +40,7 @@ public:
// TODO: Should handle multiple applied relocations via either passing in the
// previously computed value or just count paired relocations as a single
// visit.
- RelocToApply visit(uint32_t RelocType, RelocationRef R, uint64_t Value = 0) {
+ uint64_t visit(uint32_t RelocType, RelocationRef R, uint64_t Value = 0) {
if (isa<ELFObjectFileBase>(ObjToVisit))
return visitELF(RelocType, R, Value);
if (isa<COFFObjectFile>(ObjToVisit))
@@ -61,7 +49,7 @@ public:
return visitMachO(RelocType, R, Value);
HasError = true;
- return RelocToApply();
+ return 0;
}
bool error() { return HasError; }
@@ -70,7 +58,7 @@ private:
const ObjectFile &ObjToVisit;
bool HasError = false;
- RelocToApply visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
+ uint64_t visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
if (ObjToVisit.getBytesInAddress() == 8) { // 64-bit object file
switch (ObjToVisit.getArch()) {
case Triple::x86_64:
@@ -87,7 +75,7 @@ private:
return visitELF_X86_64_32S(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::aarch64:
case Triple::aarch64_be:
@@ -98,7 +86,7 @@ private:
return visitELF_AARCH64_ABS64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::bpfel:
case Triple::bpfeb:
@@ -109,7 +97,7 @@ private:
return visitELF_BPF_64_32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::mips64el:
case Triple::mips64:
@@ -120,7 +108,7 @@ private:
return visitELF_MIPS64_64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::ppc64le:
case Triple::ppc64:
@@ -131,7 +119,7 @@ private:
return visitELF_PPC64_ADDR64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::systemz:
switch (RelocType) {
@@ -141,7 +129,7 @@ private:
return visitELF_390_64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::sparcv9:
switch (RelocType) {
@@ -153,7 +141,7 @@ private:
return visitELF_SPARCV9_64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::amdgcn:
switch (RelocType) {
@@ -163,11 +151,11 @@ private:
return visitELF_AMDGPU_ABS64(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
} else if (ObjToVisit.getBytesInAddress() == 4) { // 32-bit object file
switch (ObjToVisit.getArch()) {
@@ -181,7 +169,7 @@ private:
return visitELF_386_PC32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::ppc:
switch (RelocType) {
@@ -189,14 +177,14 @@ private:
return visitELF_PPC_ADDR32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::arm:
case Triple::armeb:
switch (RelocType) {
default:
HasError = true;
- return RelocToApply();
+ return 0;
case ELF::R_ARM_ABS32:
return visitELF_ARM_ABS32(R, Value);
}
@@ -206,7 +194,7 @@ private:
return visitELF_Lanai_32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::mipsel:
case Triple::mips:
@@ -215,7 +203,7 @@ private:
return visitELF_MIPS_32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::sparc:
switch (RelocType) {
@@ -224,7 +212,7 @@ private:
return visitELF_SPARC_32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
case Triple::hexagon:
switch (RelocType) {
@@ -232,18 +220,18 @@ private:
return visitELF_HEX_32(R, Value);
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
default:
HasError = true;
- return RelocToApply();
+ return 0;
}
} else {
report_fatal_error("Invalid word size in object file");
}
}
- RelocToApply visitCOFF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
+ uint64_t visitCOFF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
switch (ObjToVisit.getArch()) {
case Triple::x86:
switch (RelocType) {
@@ -263,10 +251,10 @@ private:
break;
}
HasError = true;
- return RelocToApply();
+ return 0;
}
- RelocToApply visitMachO(uint32_t RelocType, RelocationRef R, uint64_t Value) {
+ uint64_t visitMachO(uint32_t RelocType, RelocationRef R, uint64_t Value) {
switch (ObjToVisit.getArch()) {
default: break;
case Triple::x86_64:
@@ -277,7 +265,7 @@ private:
}
}
HasError = true;
- return RelocToApply();
+ return 0;
}
int64_t getELFAddend(RelocationRef R) {
@@ -287,108 +275,88 @@ private:
return *AddendOrErr;
}
- uint8_t getLengthMachO64(RelocationRef R) {
- const MachOObjectFile *Obj = cast<MachOObjectFile>(R.getObject());
- return Obj->getRelocationLength(R.getRawDataRefImpl());
- }
-
/// Operations
/// 386-ELF
- RelocToApply visitELF_386_NONE(RelocationRef R) {
- return RelocToApply(0, 0);
+ uint64_t visitELF_386_NONE(RelocationRef R) {
+ return 0;
}
// Ideally the Addend here will be the addend in the data for
// the relocation. It's not actually the case for Rel relocations.
- RelocToApply visitELF_386_32(RelocationRef R, uint64_t Value) {
- return RelocToApply(Value, 4);
+ uint64_t visitELF_386_32(RelocationRef R, uint64_t Value) {
+ return Value;
}
- RelocToApply visitELF_386_PC32(RelocationRef R, uint64_t Value) {
- uint64_t Address = R.getOffset();
- return RelocToApply(Value - Address, 4);
+ uint64_t visitELF_386_PC32(RelocationRef R, uint64_t Value) {
+ return Value - R.getOffset();
}
/// X86-64 ELF
- RelocToApply visitELF_X86_64_NONE(RelocationRef R) {
- return RelocToApply(0, 0);
+ uint64_t visitELF_X86_64_NONE(RelocationRef R) {
+ return 0;
}
- RelocToApply visitELF_X86_64_64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+
+ uint64_t visitELF_X86_64_64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
- RelocToApply visitELF_X86_64_PC32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint64_t Address = R.getOffset();
- return RelocToApply(Value + Addend - Address, 4);
+
+ uint64_t visitELF_X86_64_PC32(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R) - R.getOffset();
}
- RelocToApply visitELF_X86_64_32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+
+ uint64_t visitELF_X86_64_32(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
- RelocToApply visitELF_X86_64_32S(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- int32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+
+ uint64_t visitELF_X86_64_32S(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
/// BPF ELF
- RelocToApply visitELF_BPF_64_32(RelocationRef R, uint64_t Value) {
- uint32_t Res = Value & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_BPF_64_32(RelocationRef R, uint64_t Value) {
+ return Value & 0xFFFFFFFF;
}
- RelocToApply visitELF_BPF_64_64(RelocationRef R, uint64_t Value) {
- return RelocToApply(Value, 8);
+
+ uint64_t visitELF_BPF_64_64(RelocationRef R, uint64_t Value) {
+ return Value;
}
/// PPC64 ELF
- RelocToApply visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
- RelocToApply visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+
+ uint64_t visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
/// PPC32 ELF
- RelocToApply visitELF_PPC_ADDR32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_PPC_ADDR32(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
/// Lanai ELF
- RelocToApply visitELF_Lanai_32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_Lanai_32(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
/// MIPS ELF
- RelocToApply visitELF_MIPS_32(RelocationRef R, uint64_t Value) {
- uint32_t Res = Value & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_MIPS_32(RelocationRef R, uint64_t Value) {
+ return Value & 0xFFFFFFFF;
}
/// MIPS64 ELF
- RelocToApply visitELF_MIPS64_32(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
- return RelocToApply(Res, 4);
+ uint64_t visitELF_MIPS64_32(RelocationRef R, uint64_t Value) {
+ return (Value + getELFAddend(R)) & 0xFFFFFFFF;
}
- RelocToApply visitELF_MIPS64_64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- uint64_t Res = (Value + Addend);
- return RelocToApply(Res, 8);
+ uint64_t visitELF_MIPS64_64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
// AArch64 ELF
- RelocToApply visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
int64_t Res = Value + Addend;
@@ -396,16 +364,15 @@ private:
if (Res < INT32_MIN || Res > UINT32_MAX)
HasError = true;
- return RelocToApply(static_cast<uint32_t>(Res), 4);
+ return static_cast<uint32_t>(Res);
}
- RelocToApply visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+ uint64_t visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
// SystemZ ELF
- RelocToApply visitELF_390_32(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_390_32(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
int64_t Res = Value + Addend;
@@ -413,77 +380,71 @@ private:
if (Res < INT32_MIN || Res > UINT32_MAX)
HasError = true;
- return RelocToApply(static_cast<uint32_t>(Res), 4);
+ return static_cast<uint32_t>(Res);
}
- RelocToApply visitELF_390_64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+ uint64_t visitELF_390_64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
- RelocToApply visitELF_SPARC_32(RelocationRef R, uint32_t Value) {
- int32_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 4);
+ uint64_t visitELF_SPARC_32(RelocationRef R, uint32_t Value) {
+ return Value + getELFAddend(R);
}
- RelocToApply visitELF_SPARCV9_32(RelocationRef R, uint64_t Value) {
- int32_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 4);
+ uint64_t visitELF_SPARCV9_32(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
- RelocToApply visitELF_SPARCV9_64(RelocationRef R, uint64_t Value) {
- int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+ uint64_t visitELF_SPARCV9_64(RelocationRef R, uint64_t Value) {
+ return Value + getELFAddend(R);
}
- RelocToApply visitELF_ARM_ABS32(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_ARM_ABS32(RelocationRef R, uint64_t Value) {
int64_t Res = Value;
// Overflow check allows for both signed and unsigned interpretation.
if (Res < INT32_MIN || Res > UINT32_MAX)
HasError = true;
- return RelocToApply(static_cast<uint32_t>(Res), 4);
+ return static_cast<uint32_t>(Res);
}
- RelocToApply visitELF_HEX_32(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_HEX_32(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 4);
+ return Value + Addend;
}
- RelocToApply visitELF_AMDGPU_ABS32(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_AMDGPU_ABS32(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 4);
+ return Value + Addend;
}
- RelocToApply visitELF_AMDGPU_ABS64(RelocationRef R, uint64_t Value) {
+ uint64_t visitELF_AMDGPU_ABS64(RelocationRef R, uint64_t Value) {
int64_t Addend = getELFAddend(R);
- return RelocToApply(Value + Addend, 8);
+ return Value + Addend;
}
/// I386 COFF
- RelocToApply visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) {
- return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+ uint64_t visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) {
+ return static_cast<uint32_t>(Value);
}
- RelocToApply visitCOFF_I386_DIR32(RelocationRef R, uint64_t Value) {
- return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+ uint64_t visitCOFF_I386_DIR32(RelocationRef R, uint64_t Value) {
+ return static_cast<uint32_t>(Value);
}
/// AMD64 COFF
- RelocToApply visitCOFF_AMD64_SECREL(RelocationRef R, uint64_t Value) {
- return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+ uint64_t visitCOFF_AMD64_SECREL(RelocationRef R, uint64_t Value) {
+ return static_cast<uint32_t>(Value);
}
- RelocToApply visitCOFF_AMD64_ADDR64(RelocationRef R, uint64_t Value) {
- return RelocToApply(Value, /*Width=*/8);
+ uint64_t visitCOFF_AMD64_ADDR64(RelocationRef R, uint64_t Value) {
+ return Value;
}
// X86_64 MachO
- RelocToApply visitMACHO_X86_64_UNSIGNED(RelocationRef R, uint64_t Value) {
- uint8_t Length = getLengthMachO64(R);
- Length = 1<<Length;
- return RelocToApply(Value, Length);
+ uint64_t visitMACHO_X86_64_UNSIGNED(RelocationRef R, uint64_t Value) {
+ return Value;
}
};
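
A sketch of a caller after the return-type change, assuming the existing RelocVisitor(const ObjectFile &) constructor; the symbol value is a placeholder.

#include "llvm/Object/RelocVisitor.h"
using namespace llvm;
using namespace llvm::object;

// Apply one relocation, falling back to 0 if the visitor flagged an error.
static uint64_t applyReloc(const ObjectFile &Obj, uint32_t RelocType,
                           RelocationRef R, uint64_t SymValue) {
  RelocVisitor V(Obj);
  uint64_t Result = V.visit(RelocType, R, SymValue);
  return V.error() ? 0 : Result;
}
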
diff --git a/include/llvm/Object/WindowsResource.h b/include/llvm/Object/WindowsResource.h
new file mode 100644
index 000000000000..f94ad09ce0c6
--- /dev/null
+++ b/include/llvm/Object/WindowsResource.h
@@ -0,0 +1,82 @@
+//===-- WindowsResource.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file declares the .res file class. .res files are intermediate
+// products of the typical resource-compilation process on Windows. This
+// process is as follows:
+//
+// .rc file(s) ---(rc.exe)---> .res file(s) ---(cvtres.exe)---> COFF file
+//
+// .rc files are human-readable scripts that list all resources a program uses.
+//
+// They are compiled into .res files, which are a list of the resources in
+// binary form.
+//
+// Finally the data stored in the .res is compiled into a COFF file, where it
+// is organized in a directory tree structure for optimized access by the
+// program during runtime.
+//
+// Ref: msdn.microsoft.com/en-us/library/windows/desktop/ms648007(v=vs.85).aspx
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_INCLUDE_LLVM_OBJECT_RESFILE_H
+#define LLVM_INCLUDE_LLVM_OBJECT_RESFILE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace object {
+
+class WindowsResource;
+
+class ResourceEntryRef {
+public:
+ Error moveNext(bool &End);
+
+private:
+ friend class WindowsResource;
+
+ ResourceEntryRef(BinaryStreamRef Ref, const WindowsResource *Owner,
+ Error &Err);
+ Error loadNext();
+
+ BinaryStreamReader Reader;
+ BinaryStreamRef HeaderBytes;
+ BinaryStreamRef DataBytes;
+ const WindowsResource *OwningRes = nullptr;
+};
+
+class WindowsResource : public Binary {
+public:
+ ~WindowsResource() override;
+ Expected<ResourceEntryRef> getHeadEntry();
+
+ static bool classof(const Binary *V) { return V->isWinRes(); }
+
+ static Expected<std::unique_ptr<WindowsResource>>
+ createWindowsResource(MemoryBufferRef Source);
+
+private:
+ friend class ResourceEntryRef;
+
+ WindowsResource(MemoryBufferRef Source);
+
+ BinaryByteStream BBS;
+};
+
+} // namespace object
+} // namespace llvm
+
+#endif
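
A rough sketch of how a consumer might walk a .res file with this interface; the counting helper is invented and error handling is abbreviated.

#include "llvm/Object/WindowsResource.h"
using namespace llvm;
using namespace llvm::object;

static Error countEntries(MemoryBufferRef MB, unsigned &Count) {
  Expected<std::unique_ptr<WindowsResource>> ResOrErr =
      WindowsResource::createWindowsResource(MB);
  if (!ResOrErr)
    return ResOrErr.takeError();
  Expected<ResourceEntryRef> EntryOrErr = (*ResOrErr)->getHeadEntry();
  if (!EntryOrErr)
    return EntryOrErr.takeError();
  ResourceEntryRef Entry = *EntryOrErr;
  Count = 0;
  bool End = false;
  while (!End) {
    ++Count;
    if (Error E = Entry.moveNext(End))
      return E;
  }
  return Error::success();
}
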
diff --git a/include/llvm/PassInfo.h b/include/llvm/PassInfo.h
index 21ade85b682f..81dface3c9a0 100644
--- a/include/llvm/PassInfo.h
+++ b/include/llvm/PassInfo.h
@@ -32,7 +32,6 @@ class TargetMachine;
class PassInfo {
public:
typedef Pass* (*NormalCtor_t)();
- typedef Pass *(*TargetMachineCtor_t)(TargetMachine *);
private:
StringRef PassName; // Nice name for Pass
@@ -44,24 +43,20 @@ private:
std::vector<const PassInfo *> ItfImpl; // Interfaces implemented by this pass
NormalCtor_t NormalCtor;
- TargetMachineCtor_t TargetMachineCtor;
public:
/// PassInfo ctor - Do not call this directly, this should only be invoked
/// through RegisterPass.
PassInfo(StringRef name, StringRef arg, const void *pi, NormalCtor_t normal,
- bool isCFGOnly, bool is_analysis,
- TargetMachineCtor_t machine = nullptr)
+ bool isCFGOnly, bool is_analysis)
: PassName(name), PassArgument(arg), PassID(pi), IsCFGOnlyPass(isCFGOnly),
- IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal),
- TargetMachineCtor(machine) {}
+ IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal) {}
/// PassInfo ctor - Do not call this directly, this should only be invoked
/// through RegisterPass. This version is for use by analysis groups; it
/// does not auto-register the pass.
PassInfo(StringRef name, const void *pi)
: PassName(name), PassArgument(""), PassID(pi), IsCFGOnlyPass(false),
- IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr),
- TargetMachineCtor(nullptr) {}
+ IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr) {}
/// getPassName - Return the friendly name for the pass, never returns null
///
@@ -101,16 +96,6 @@ public:
NormalCtor = Ctor;
}
- /// getTargetMachineCtor - Return a pointer to a function, that when called
- /// with a TargetMachine, creates an instance of the pass and returns it.
- /// This pointer may be null if there is no constructor with a TargetMachine
- /// for the pass.
- ///
- TargetMachineCtor_t getTargetMachineCtor() const { return TargetMachineCtor; }
- void setTargetMachineCtor(TargetMachineCtor_t Ctor) {
- TargetMachineCtor = Ctor;
- }
-
/// createPass() - Use this method to create an instance of this pass.
Pass *createPass() const {
assert((!isAnalysisGroup() || NormalCtor) &&
diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index 50e6b498fb46..602f45ac5178 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h
@@ -31,8 +31,6 @@
namespace llvm {
-class TargetMachine;
-
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis) \
static void *initialize##passName##PassOnce(PassRegistry &Registry) { \
PassInfo *PI = new PassInfo( \
@@ -78,10 +76,6 @@ class TargetMachine;
template <typename PassName> Pass *callDefaultCtor() { return new PassName(); }
-template <typename PassName> Pass *callTargetMachineCtor(TargetMachine *TM) {
- return new PassName(TM);
-}
-
//===---------------------------------------------------------------------------
/// RegisterPass<t> template - This template class is used to notify the system
/// that a Pass is available for use, and registers it into the internal
diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h
index 77738077f5ff..56375f41d2c0 100644
--- a/include/llvm/Support/BinaryStreamReader.h
+++ b/include/llvm/Support/BinaryStreamReader.h
@@ -16,7 +16,6 @@
#include "llvm/Support/BinaryStreamRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
-#include "llvm/Support/MathExtras.h"
#include "llvm/Support/type_traits.h"
#include <string>
@@ -32,7 +31,21 @@ namespace llvm {
class BinaryStreamReader {
public:
BinaryStreamReader() = default;
- explicit BinaryStreamReader(BinaryStreamRef Stream);
+ explicit BinaryStreamReader(BinaryStreamRef Ref);
+ explicit BinaryStreamReader(BinaryStream &Stream);
+ explicit BinaryStreamReader(ArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian);
+ explicit BinaryStreamReader(StringRef Data, llvm::support::endianness Endian);
+
+ BinaryStreamReader(const BinaryStreamReader &Other)
+ : Stream(Other.Stream), Offset(Other.Offset) {}
+
+ BinaryStreamReader &operator=(const BinaryStreamReader &Other) {
+ Stream = Other.Stream;
+ Offset = Other.Offset;
+ return *this;
+ }
+
virtual ~BinaryStreamReader() {}
/// Read as much as possible from the underlying string at the current offset
@@ -244,12 +257,14 @@ public:
/// \returns the next byte in the stream.
uint8_t peek() const;
+ Error padToAlignment(uint32_t Align);
+
std::pair<BinaryStreamReader, BinaryStreamReader>
split(uint32_t Offset) const;
private:
BinaryStreamRef Stream;
- uint32_t Offset;
+ uint32_t Offset = 0;
};
} // namespace llvm
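
Illustrative use of the new convenience constructor and padToAlignment; the byte layout is invented for the example.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BinaryStreamReader.h"
using namespace llvm;

// Read a 16-bit length straight out of a byte buffer, then realign to 4 bytes.
static Error readHeader(ArrayRef<uint8_t> Bytes, uint16_t &Len) {
  BinaryStreamReader Reader(Bytes, support::little);
  if (Error E = Reader.readInteger(Len))
    return E;
  return Reader.padToAlignment(4);
}
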
diff --git a/include/llvm/Support/BinaryStreamRef.h b/include/llvm/Support/BinaryStreamRef.h
index 465e724a6886..e3bd4bf0860e 100644
--- a/include/llvm/Support/BinaryStreamRef.h
+++ b/include/llvm/Support/BinaryStreamRef.h
@@ -16,36 +16,74 @@
#include "llvm/Support/Error.h"
#include <algorithm>
#include <cstdint>
+#include <memory>
namespace llvm {
/// Common stuff for mutable and immutable StreamRefs.
-template <class StreamType, class RefType> class BinaryStreamRefBase {
-public:
- BinaryStreamRefBase() : Stream(nullptr), ViewOffset(0), Length(0) {}
- BinaryStreamRefBase(StreamType &Stream, uint32_t Offset, uint32_t Length)
- : Stream(&Stream), ViewOffset(Offset), Length(Length) {}
+template <class RefType, class StreamType> class BinaryStreamRefBase {
+protected:
+ BinaryStreamRefBase() = default;
+ BinaryStreamRefBase(std::shared_ptr<StreamType> SharedImpl, uint32_t Offset,
+ uint32_t Length)
+ : SharedImpl(SharedImpl), BorrowedImpl(SharedImpl.get()),
+ ViewOffset(Offset), Length(Length) {}
+ BinaryStreamRefBase(StreamType &BorrowedImpl, uint32_t Offset,
+ uint32_t Length)
+ : BorrowedImpl(&BorrowedImpl), ViewOffset(Offset), Length(Length) {}
+ BinaryStreamRefBase(const BinaryStreamRefBase &Other) {
+ SharedImpl = Other.SharedImpl;
+ BorrowedImpl = Other.BorrowedImpl;
+ ViewOffset = Other.ViewOffset;
+ Length = Other.Length;
+ }
- llvm::support::endianness getEndian() const { return Stream->getEndian(); }
+public:
+ llvm::support::endianness getEndian() const {
+ return BorrowedImpl->getEndian();
+ }
uint32_t getLength() const { return Length; }
- const StreamType *getStream() const { return Stream; }
/// Return a new BinaryStreamRef with the first \p N elements removed.
RefType drop_front(uint32_t N) const {
- if (!Stream)
+ if (!BorrowedImpl)
return RefType();
N = std::min(N, Length);
- return RefType(*Stream, ViewOffset + N, Length - N);
+ RefType Result(static_cast<const RefType &>(*this));
+ Result.ViewOffset += N;
+ Result.Length -= N;
+ return Result;
}
- /// Return a new BinaryStreamRef with only the first \p N elements remaining.
- RefType keep_front(uint32_t N) const {
- if (!Stream)
+ /// Return a new BinaryStreamRef with the last \p N elements removed.
+ RefType drop_back(uint32_t N) const {
+ if (!BorrowedImpl)
return RefType();
+
N = std::min(N, Length);
- return RefType(*Stream, ViewOffset, N);
+ RefType Result(static_cast<const RefType &>(*this));
+ Result.Length -= N;
+ return Result;
+ }
+
+ /// Return a new BinaryStreamRef with only the first \p N elements remaining.
+ RefType keep_front(uint32_t N) const {
+ assert(N <= getLength());
+ return drop_back(getLength() - N);
+ }
+
+ /// Return a new BinaryStreamRef with only the last \p N elements remaining.
+ RefType keep_back(uint32_t N) const {
+ assert(N <= getLength());
+ return drop_front(getLength() - N);
+ }
+
+ /// Return a new BinaryStreamRef with the first and last \p N elements
+ /// removed.
+ RefType drop_symmetric(uint32_t N) const {
+ return drop_front(N).drop_back(N);
}
/// Return a new BinaryStreamRef with the first \p Offset elements removed,
@@ -54,8 +92,10 @@ public:
return drop_front(Offset).keep_front(Len);
}
+ bool valid() const { return BorrowedImpl != nullptr; }
+
bool operator==(const RefType &Other) const {
- if (Stream != Other.Stream)
+ if (BorrowedImpl != Other.BorrowedImpl)
return false;
if (ViewOffset != Other.ViewOffset)
return false;
@@ -73,9 +113,10 @@ protected:
return Error::success();
}
- StreamType *Stream;
- uint32_t ViewOffset;
- uint32_t Length;
+ std::shared_ptr<StreamType> SharedImpl;
+ StreamType *BorrowedImpl = nullptr;
+ uint32_t ViewOffset = 0;
+ uint32_t Length = 0;
};
/// \brief BinaryStreamRef is to BinaryStream what ArrayRef is to an Array. It
@@ -86,21 +127,27 @@ protected:
/// and use inheritance to achieve polymorphism. Instead, you should pass
/// around BinaryStreamRefs by value and achieve polymorphism that way.
class BinaryStreamRef
- : public BinaryStreamRefBase<BinaryStream, BinaryStreamRef> {
+ : public BinaryStreamRefBase<BinaryStreamRef, BinaryStream> {
+ friend BinaryStreamRefBase<BinaryStreamRef, BinaryStream>;
+ friend class WritableBinaryStreamRef;
+ BinaryStreamRef(std::shared_ptr<BinaryStream> Impl, uint32_t ViewOffset,
+ uint32_t Length)
+ : BinaryStreamRefBase(Impl, ViewOffset, Length) {}
+
public:
BinaryStreamRef() = default;
- BinaryStreamRef(BinaryStream &Stream)
- : BinaryStreamRefBase(Stream, 0, Stream.getLength()) {}
- BinaryStreamRef(BinaryStream &Stream, uint32_t Offset, uint32_t Length)
- : BinaryStreamRefBase(Stream, Offset, Length) {}
+ BinaryStreamRef(BinaryStream &Stream);
+ BinaryStreamRef(BinaryStream &Stream, uint32_t Offset, uint32_t Length);
+ explicit BinaryStreamRef(ArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian);
+ explicit BinaryStreamRef(StringRef Data, llvm::support::endianness Endian);
+
+ BinaryStreamRef(const BinaryStreamRef &Other);
// Use BinaryStreamRef.slice() instead.
BinaryStreamRef(BinaryStreamRef &S, uint32_t Offset,
uint32_t Length) = delete;
- /// Check if a Stream is valid.
- bool valid() const { return Stream != nullptr; }
-
/// Given an Offset into this StreamRef and a Size, return a reference to a
/// buffer owned by the stream.
///
@@ -108,12 +155,7 @@ public:
/// bounds of this BinaryStreamRef's view and the implementation could read
/// the data, and an appropriate error code otherwise.
Error readBytes(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
- if (auto EC = checkOffset(Offset, Size))
- return EC;
-
- return Stream->readBytes(ViewOffset + Offset, Size, Buffer);
- }
+ ArrayRef<uint8_t> &Buffer) const;
/// Given an Offset into this BinaryStreamRef, return a reference to the
/// largest buffer the stream could support without necessitating a copy.
@@ -121,33 +163,25 @@ public:
/// \returns a success error code if implementation could read the data,
/// and an appropriate error code otherwise.
Error readLongestContiguousChunk(uint32_t Offset,
- ArrayRef<uint8_t> &Buffer) const {
- if (auto EC = checkOffset(Offset, 1))
- return EC;
-
- if (auto EC =
- Stream->readLongestContiguousChunk(ViewOffset + Offset, Buffer))
- return EC;
- // This StreamRef might refer to a smaller window over a larger stream. In
- // that case we will have read out more bytes than we should return, because
- // we should not read past the end of the current view.
- uint32_t MaxLength = Length - Offset;
- if (Buffer.size() > MaxLength)
- Buffer = Buffer.slice(0, MaxLength);
- return Error::success();
- }
+ ArrayRef<uint8_t> &Buffer) const;
};
class WritableBinaryStreamRef
- : public BinaryStreamRefBase<WritableBinaryStream,
- WritableBinaryStreamRef> {
+ : public BinaryStreamRefBase<WritableBinaryStreamRef,
+ WritableBinaryStream> {
+ friend BinaryStreamRefBase<WritableBinaryStreamRef, WritableBinaryStream>;
+ WritableBinaryStreamRef(std::shared_ptr<WritableBinaryStream> Impl,
+ uint32_t ViewOffset, uint32_t Length)
+ : BinaryStreamRefBase(Impl, ViewOffset, Length) {}
+
public:
WritableBinaryStreamRef() = default;
- WritableBinaryStreamRef(WritableBinaryStream &Stream)
- : BinaryStreamRefBase(Stream, 0, Stream.getLength()) {}
+ WritableBinaryStreamRef(WritableBinaryStream &Stream);
WritableBinaryStreamRef(WritableBinaryStream &Stream, uint32_t Offset,
- uint32_t Length)
- : BinaryStreamRefBase(Stream, Offset, Length) {}
+ uint32_t Length);
+ explicit WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian);
+ WritableBinaryStreamRef(const WritableBinaryStreamRef &Other);
// Use WritableBinaryStreamRef.slice() instead.
WritableBinaryStreamRef(WritableBinaryStreamRef &S, uint32_t Offset,
@@ -159,17 +193,13 @@ public:
/// \returns a success error code if the data could fit within the underlying
/// stream at the specified location and the implementation could write the
/// data, and an appropriate error code otherwise.
- Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const {
- if (auto EC = checkOffset(Offset, Data.size()))
- return EC;
-
- return Stream->writeBytes(ViewOffset + Offset, Data);
- }
+ Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const;
- operator BinaryStreamRef() { return BinaryStreamRef(*Stream); }
+ /// Convert this WritableBinaryStreamRef to a read-only BinaryStreamRef.
+ operator BinaryStreamRef() const;
/// \brief For buffered streams, commits changes to the backing store.
- Error commit() { return Stream->commit(); }
+ Error commit();
};
} // end namespace llvm
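
A small sketch of the new value-type constructors and slicing helpers; the header and footer sizes are placeholders.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/BinaryStreamRef.h"
using namespace llvm;

// View a raw buffer as a stream and strip an 8-byte header and 4-byte footer.
static BinaryStreamRef payloadView(ArrayRef<uint8_t> Bytes) {
  BinaryStreamRef Whole(Bytes, support::little);
  return Whole.drop_front(8).drop_back(4);
}
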
diff --git a/include/llvm/Support/BinaryStreamWriter.h b/include/llvm/Support/BinaryStreamWriter.h
index 1b61c32a2541..a4495a1ce27d 100644
--- a/include/llvm/Support/BinaryStreamWriter.h
+++ b/include/llvm/Support/BinaryStreamWriter.h
@@ -32,7 +32,20 @@ namespace llvm {
class BinaryStreamWriter {
public:
BinaryStreamWriter() = default;
- explicit BinaryStreamWriter(WritableBinaryStreamRef Stream);
+ explicit BinaryStreamWriter(WritableBinaryStreamRef Ref);
+ explicit BinaryStreamWriter(WritableBinaryStream &Stream);
+ explicit BinaryStreamWriter(MutableArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian);
+
+ BinaryStreamWriter(const BinaryStreamWriter &Other)
+ : Stream(Other.Stream), Offset(Other.Offset) {}
+
+ BinaryStreamWriter &operator=(const BinaryStreamWriter &Other) {
+ Stream = Other.Stream;
+ Offset = Other.Offset;
+ return *this;
+ }
+
virtual ~BinaryStreamWriter() {}
/// Write the bytes specified in \p Buffer to the underlying stream.
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index e3c5de7fbe64..7caefb5359b8 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -261,7 +261,7 @@ struct file_magic {
coff_object, ///< COFF object file
coff_import_library, ///< COFF import library
pecoff_executable, ///< PECOFF executable file
- windows_resource, ///< Windows compiled resource file (.rc)
+ windows_resource, ///< Windows compiled resource file (.res)
wasm_object ///< WebAssembly Object file
};
diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 071ec2edb538..a06c67fe814c 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -62,7 +62,6 @@ def : GINodeEquiv<G_FMUL, fmul>;
def : GINodeEquiv<G_FDIV, fdiv>;
def : GINodeEquiv<G_FREM, frem>;
def : GINodeEquiv<G_FPOW, fpow>;
-def : GINodeEquiv<G_INTRINSIC, intrinsic_wo_chain>;
def : GINodeEquiv<G_BR, br>;
// Specifies the GlobalISel equivalents for SelectionDAG's ComplexPattern.
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index ed5742ab8b56..d66b6edc7a4f 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -53,8 +53,7 @@ public:
: Index(Index), ModuleLoader(std::move(ModuleLoader)) {}
/// Import functions in Module \p M based on the supplied import list.
- Expected<bool>
- importFunctions(Module &M, const ImportMapTy &ImportList);
+ Expected<bool> importFunctions(Module &M, const ImportMapTy &ImportList);
private:
/// The summaries index used to trigger importing.
diff --git a/include/llvm/Transforms/Scalar/GVNExpression.h b/include/llvm/Transforms/Scalar/GVNExpression.h
index 2670a0c1a533..a971df975b6f 100644
--- a/include/llvm/Transforms/Scalar/GVNExpression.h
+++ b/include/llvm/Transforms/Scalar/GVNExpression.h
@@ -40,6 +40,7 @@ enum ExpressionType {
ET_Base,
ET_Constant,
ET_Variable,
+ ET_Dead,
ET_Unknown,
ET_BasicStart,
ET_Basic,
@@ -380,7 +381,9 @@ public:
OS << "ExpressionTypeStore, ";
this->BasicExpression::printInternal(OS, false);
OS << " represents Store " << *Store;
- OS << " with MemoryLeader " << *getMemoryLeader();
+ OS << " with StoredValue ";
+ StoredValue->printAsOperand(OS);
+ OS << " and MemoryLeader " << *getMemoryLeader();
}
};
@@ -513,6 +516,17 @@ public:
}
};
+class DeadExpression final : public Expression {
+public:
+ DeadExpression() : Expression(ET_Dead) {}
+ DeadExpression(const DeadExpression &) = delete;
+ DeadExpression &operator=(const DeadExpression &) = delete;
+
+ static bool classof(const Expression *E) {
+ return E->getExpressionType() == ET_Dead;
+ }
+};
+
class VariableExpression final : public Expression {
private:
Value *VariableValue;
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 665dd6f4b257..6aba9b2298b1 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -121,6 +121,7 @@ private:
Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B);
Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B);
+ Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B);
// Wrapper for all String/Memory Library Call Optimizations
Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
@@ -165,6 +166,9 @@ private:
/// hasFloatVersion - Checks if there is a float version of the specified
/// function by checking for an existing function with name FuncName + f
bool hasFloatVersion(StringRef FuncName);
+
+ /// Shared code to optimize strlen+wcslen.
+ Value *optimizeStringLength(CallInst *CI, IRBuilder<> &B, unsigned CharSize);
};
} // End llvm namespace
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index a33c01a0e461..f743cb234c45 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -683,8 +683,11 @@ static bool isIntrinsicCall(ImmutableCallSite CS, Intrinsic::ID IID) {
#ifndef NDEBUG
static const Function *getParent(const Value *V) {
- if (const Instruction *inst = dyn_cast<Instruction>(V))
+ if (const Instruction *inst = dyn_cast<Instruction>(V)) {
+ if (!inst->getParent())
+ return nullptr;
return inst->getParent()->getParent();
+ }
if (const Argument *arg = dyn_cast<Argument>(V))
return arg->getParent();
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index db87b17c1567..267e19adfe4d 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -58,45 +58,12 @@ char BranchProbabilityInfoWrapperPass::ID = 0;
static const uint32_t LBH_TAKEN_WEIGHT = 124;
static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
-/// \brief Unreachable-terminating branch taken weight.
+/// \brief Unreachable-terminating branch taken probability.
///
-/// This is the weight for a branch being taken to a block that terminates
+/// This is the probability for a branch being taken to a block that terminates
/// (eventually) in unreachable. These are predicted as unlikely as possible.
-static const uint32_t UR_TAKEN_WEIGHT = 1;
-
-/// \brief Unreachable-terminating branch not-taken weight.
-///
-/// This is the weight for a branch not being taken toward a block that
-/// terminates (eventually) in unreachable. Such a branch is essentially never
-/// taken. Set the weight to an absurdly high value so that nested loops don't
-/// easily subsume it.
-static const uint32_t UR_NONTAKEN_WEIGHT = 1024*1024 - 1;
-
-/// \brief Returns the branch probability for unreachable edge according to
-/// heuristic.
-///
-/// This is the branch probability being taken to a block that terminates
-/// (eventually) in unreachable. These are predicted as unlikely as possible.
-static BranchProbability getUnreachableProbability(uint64_t UnreachableCount) {
- assert(UnreachableCount > 0 && "UnreachableCount must be > 0");
- return BranchProbability::getBranchProbability(
- UR_TAKEN_WEIGHT,
- (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * UnreachableCount);
-}
-
-/// \brief Returns the branch probability for reachable edge according to
-/// heuristic.
-///
-/// This is the branch probability not being taken toward a block that
-/// terminates (eventually) in unreachable. Such a branch is essentially never
-/// taken. Set the weight to an absurdly high value so that nested loops don't
-/// easily subsume it.
-static BranchProbability getReachableProbability(uint64_t ReachableCount) {
- assert(ReachableCount > 0 && "ReachableCount must be > 0");
- return BranchProbability::getBranchProbability(
- UR_NONTAKEN_WEIGHT,
- (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * ReachableCount);
-}
+/// All reachable edges equally share the remaining probability.
+static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1);
/// \brief Weight for a branch taken going into a cold block.
///
@@ -232,8 +199,10 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
return true;
}
- auto UnreachableProb = getUnreachableProbability(UnreachableEdges.size());
- auto ReachableProb = getReachableProbability(ReachableEdges.size());
+ auto UnreachableProb = UR_TAKEN_PROB;
+ auto ReachableProb =
+ (BranchProbability::getOne() - UR_TAKEN_PROB * UnreachableEdges.size()) /
+ ReachableEdges.size();
for (unsigned SuccIdx : UnreachableEdges)
setEdgeProbability(BB, SuccIdx, UnreachableProb);
@@ -319,7 +288,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// If the unreachable heuristic is stronger then we use it for this edge.
if (UnreachableIdxs.size() > 0 && ReachableIdxs.size() > 0) {
auto ToDistribute = BranchProbability::getZero();
- auto UnreachableProb = getUnreachableProbability(UnreachableIdxs.size());
+ auto UnreachableProb = UR_TAKEN_PROB;
for (auto i : UnreachableIdxs)
if (UnreachableProb < BP[i]) {
ToDistribute += BP[i] - UnreachableProb;
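
Worked example of the new arithmetic, for illustration: BranchProbability uses a fixed denominator of 2^31, so UR_TAKEN_PROB = getRaw(1) is 1/2^31 for each unreachable edge. With, say, two unreachable and three reachable successors, each reachable edge now receives (1 - 2/2^31) / 3, i.e. essentially one third, instead of the old UR_TAKEN_WEIGHT/UR_NONTAKEN_WEIGHT split.
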
diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp
index 8058e5b1935c..5896e6e0902f 100644
--- a/lib/Analysis/CallGraphSCCPass.cpp
+++ b/lib/Analysis/CallGraphSCCPass.cpp
@@ -477,10 +477,8 @@ bool CGPassManager::runOnModule(Module &M) {
if (DevirtualizedCall)
DEBUG(dbgs() << " CGSCCPASSMGR: Stopped iteration after " << Iteration
<< " times, due to -max-cg-scc-iterations\n");
-
- if (Iteration > MaxSCCIterations)
- MaxSCCIterations = Iteration;
-
+
+ MaxSCCIterations.updateMax(Iteration);
}
Changed |= doFinalization(CG);
return Changed;
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 5652248a60ce..2e72d5aa8269 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -126,8 +126,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
/// Returns the simplified value, or null if no simplification was performed.
static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
- Instruction::BinaryOps OpcodeToExpand, const SimplifyQuery &Q,
- unsigned MaxRecurse) {
+ Instruction::BinaryOps OpcodeToExpand,
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -184,7 +184,8 @@ static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
/// Generic simplifications for associative binary operations.
/// Returns the simpler value, or null if none was found.
static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode,
- Value *LHS, Value *RHS, const SimplifyQuery &Q,
+ Value *LHS, Value *RHS,
+ const SimplifyQuery &Q,
unsigned MaxRecurse) {
assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
@@ -2260,28 +2261,49 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
if (!OpTy->getScalarType()->isIntegerTy(1))
return nullptr;
- switch (Pred) {
- default:
- break;
- case ICmpInst::ICMP_EQ:
- // X == 1 -> X
- if (match(RHS, m_One()))
- return LHS;
- break;
- case ICmpInst::ICMP_NE:
- // X != 0 -> X
- if (match(RHS, m_Zero()))
+ // A boolean compared to true/false can be simplified in 14 out of the 20
+ // (10 predicates * 2 constants) possible combinations. Cases not handled here
+ // require a 'not' of the LHS, so those must be transformed in InstCombine.
+ if (match(RHS, m_Zero())) {
+ switch (Pred) {
+ case CmpInst::ICMP_NE: // X != 0 -> X
+ case CmpInst::ICMP_UGT: // X >u 0 -> X
+ case CmpInst::ICMP_SLT: // X <s 0 -> X
return LHS;
- break;
- case ICmpInst::ICMP_UGT:
- // X >u 0 -> X
- if (match(RHS, m_Zero()))
+
+ case CmpInst::ICMP_ULT: // X <u 0 -> false
+ case CmpInst::ICMP_SGT: // X >s 0 -> false
+ return getFalse(ITy);
+
+ case CmpInst::ICMP_UGE: // X >=u 0 -> true
+ case CmpInst::ICMP_SLE: // X <=s 0 -> true
+ return getTrue(ITy);
+
+ default: break;
+ }
+ } else if (match(RHS, m_One())) {
+ switch (Pred) {
+ case CmpInst::ICMP_EQ: // X == 1 -> X
+ case CmpInst::ICMP_UGE: // X >=u 1 -> X
+ case CmpInst::ICMP_SLE: // X <=s -1 -> X
return LHS;
+
+ case CmpInst::ICMP_UGT: // X >u 1 -> false
+ case CmpInst::ICMP_SLT: // X <s -1 -> false
+ return getFalse(ITy);
+
+ case CmpInst::ICMP_ULE: // X <=u 1 -> true
+ case CmpInst::ICMP_SGE: // X >=s -1 -> true
+ return getTrue(ITy);
+
+ default: break;
+ }
+ }
+
+ switch (Pred) {
+ default:
break;
case ICmpInst::ICMP_UGE:
- // X >=u 1 -> X
- if (match(RHS, m_One()))
- return LHS;
if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
@@ -2296,16 +2318,6 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- // X <s 0 -> X
- if (match(RHS, m_Zero()))
- return LHS;
- break;
- case ICmpInst::ICMP_SLE:
- // X <=s -1 -> X
- if (match(RHS, m_One()))
- return LHS;
- break;
case ICmpInst::ICMP_ULE:
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
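The new switches enumerate the folds by truth table rather than by repeated matches. As a minimal standalone check of that table, assuming plain C++ integers stand in for the i1 value in its unsigned (0/1) and signed (0/-1) interpretations:

#include <cassert>

int main() {
  for (int Bit = 0; Bit <= 1; ++Bit) {
    bool X = Bit != 0;
    unsigned U = Bit;        // unsigned view of the i1 value: 0 or 1
    int S = Bit ? -1 : 0;    // signed view of the i1 value: 0 or -1

    // Constant 0 on the RHS.
    assert((U != 0u) == X);  // X != 0   -> X
    assert((U > 0u) == X);   // X >u 0   -> X
    assert((S < 0) == X);    // X <s 0   -> X
    assert(!(S > 0));        // X >s 0   -> false
    assert(S <= 0);          // X <=s 0  -> true
    // X <u 0 -> false and X >=u 0 -> true are tautologies for unsigned U.

    // Constant 1 (i.e. -1 when interpreted as signed i1) on the RHS.
    assert((U == 1u) == X);  // X == 1   -> X
    assert((U >= 1u) == X);  // X >=u 1  -> X
    assert((S <= -1) == X);  // X <=s -1 -> X
    assert(!(U > 1u));       // X >u 1   -> false
    assert(!(S < -1));       // X <s -1  -> false
    assert(U <= 1u);         // X <=u 1  -> true
    assert(S >= -1);         // X >=s -1 -> true
  }
  return 0;
}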
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 2480fe44d5c0..e0e04a91410f 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -1799,6 +1799,15 @@ bool MemorySSA::dominates(const MemoryAccess *Dominator,
const static char LiveOnEntryStr[] = "liveOnEntry";
+void MemoryAccess::print(raw_ostream &OS) const {
+ switch (getValueID()) {
+ case MemoryPhiVal: return static_cast<const MemoryPhi *>(this)->print(OS);
+ case MemoryDefVal: return static_cast<const MemoryDef *>(this)->print(OS);
+ case MemoryUseVal: return static_cast<const MemoryUse *>(this)->print(OS);
+ }
+ llvm_unreachable("invalid value id");
+}
+
void MemoryDef::print(raw_ostream &OS) const {
MemoryAccess *UO = getDefiningAccess();
@@ -1836,8 +1845,6 @@ void MemoryPhi::print(raw_ostream &OS) const {
OS << ')';
}
-MemoryAccess::~MemoryAccess() {}
-
void MemoryUse::print(raw_ostream &OS) const {
MemoryAccess *UO = getDefiningAccess();
OS << "MemoryUse(";
@@ -2054,3 +2061,15 @@ MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess(
return StartingAccess;
}
} // namespace llvm
+
+void MemoryPhi::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryPhi *>(Self);
+}
+
+void MemoryDef::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryDef *>(Self);
+}
+
+void MemoryUse::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryUse *>(Self);
+}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index a746ddfd7a63..78ded8141c08 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -3885,7 +3885,7 @@ public:
: SCEVRewriteVisitor(SE), L(L), Valid(true) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
- if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant))
+ if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
@@ -3919,7 +3919,7 @@ public:
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
// Only allow AddRecExprs for this loop.
- if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant))
+ if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
@@ -5947,6 +5947,8 @@ ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
if (any_of(ExitNotTaken, PredicateNotAlwaysTrue) || !getMax())
return SE->getCouldNotCompute();
+ assert((isa<SCEVCouldNotCompute>(getMax()) || isa<SCEVConstant>(getMax())) &&
+ "No point in having a non-constant max backedge taken count!");
return getMax();
}
@@ -5972,7 +5974,11 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
}
ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
- : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {}
+ : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
ScalarEvolution::ExitLimit::ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
@@ -5981,6 +5987,9 @@ ScalarEvolution::ExitLimit::ExitLimit(
assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
!isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
"Exact is not allowed to be less precise than Max");
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
for (auto *PredSet : PredSetList)
for (auto *P : *PredSet)
addPredicate(P);
@@ -5989,11 +5998,19 @@ ScalarEvolution::ExitLimit::ExitLimit(
ScalarEvolution::ExitLimit::ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
- : ExitLimit(E, M, MaxOrZero, {&PredSet}) {}
+ : ExitLimit(E, M, MaxOrZero, {&PredSet}) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
bool MaxOrZero)
- : ExitLimit(E, M, MaxOrZero, None) {}
+ : ExitLimit(E, M, MaxOrZero, None) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
@@ -6018,6 +6035,8 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, std::move(Predicate));
});
+ assert((isa<SCEVCouldNotCompute>(MaxCount) || isa<SCEVConstant>(MaxCount)) &&
+ "No point in having a non-constant max backedge taken count!");
}
/// Invalidate this result and free the ExitNotTakenInfo array.
@@ -6279,7 +6298,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
// to not.
if (isa<SCEVCouldNotCompute>(MaxBECount) &&
!isa<SCEVCouldNotCompute>(BECount))
- MaxBECount = BECount;
+ MaxBECount = getConstant(getUnsignedRange(BECount).getUnsignedMax());
return ExitLimit(BECount, MaxBECount, false,
{&EL0.Predicates, &EL1.Predicates});
@@ -7583,13 +7602,20 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
loopHasNoAbnormalExits(AddRec->getLoop())) {
const SCEV *Exact =
getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
- return ExitLimit(Exact, Exact, false, Predicates);
+ const SCEV *Max =
+ Exact == getCouldNotCompute()
+ ? Exact
+ : getConstant(getUnsignedRange(Exact).getUnsignedMax());
+ return ExitLimit(Exact, Max, false, Predicates);
}
// Solve the general equation.
- const SCEV *E = SolveLinEquationWithOverflow(
- StepC->getAPInt(), getNegativeSCEV(Start), *this);
- return ExitLimit(E, E, false, Predicates);
+ const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(),
+ getNegativeSCEV(Start), *this);
+ const SCEV *M = E == getCouldNotCompute()
+ ? E
+ : getConstant(getUnsignedRange(E).getUnsignedMax());
+ return ExitLimit(E, M, false, Predicates);
}
ScalarEvolution::ExitLimit
@@ -9218,8 +9244,9 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
getConstant(StrideForMaxBECount), false);
}
- if (isa<SCEVCouldNotCompute>(MaxBECount))
- MaxBECount = BECount;
+ if (isa<SCEVCouldNotCompute>(MaxBECount) &&
+ !isa<SCEVCouldNotCompute>(BECount))
+ MaxBECount = getConstant(getUnsignedRange(BECount).getUnsignedMax());
return ExitLimit(BECount, MaxBECount, MaxOrZero, Predicates);
}
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 3cf1bbc5daa5..2be5d5caf7c2 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -13,6 +13,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -1518,6 +1519,21 @@ TargetLibraryInfoImpl &TargetLibraryAnalysis::lookupInfoImpl(const Triple &T) {
return *Impl;
}
+unsigned TargetLibraryInfoImpl::getTargetWCharSize(const Triple &T) {
+ // See also clang/lib/Basic/Targets.cpp.
+ if (T.isPS4() || T.isOSWindows() || T.isArch16Bit())
+ return 2;
+ if (T.getArch() == Triple::xcore)
+ return 1;
+ return 4;
+}
+
+unsigned TargetLibraryInfoImpl::getWCharSize(const Module &M) const {
+ if (auto *ShortWChar = cast_or_null<ConstantAsMetadata>(
+ M.getModuleFlag("wchar_size")))
+ return cast<ConstantInt>(ShortWChar->getValue())->getZExtValue();
+ return getTargetWCharSize(Triple(M.getTargetTriple()));
+}
TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass()
: ImmutablePass(ID), TLIImpl(), TLI(TLIImpl) {
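getWCharSize() prefers the frontend-recorded "wchar_size" module flag and only falls back to the triple-based default from getTargetWCharSize(). A minimal sketch of producing such a flag, assuming a hypothetical standalone tool linked against the LLVM IR libraries:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("wchar-demo", Ctx);
  M.setTargetTriple("x86_64-pc-windows-msvc");
  // Windows targets use a 2-byte wchar_t; getWCharSize(M) reads this flag
  // instead of recomputing the default from the triple.
  M.addModuleFlag(llvm::Module::Error, "wchar_size", 2);
  return 0;
}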
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index cba7363a0afa..8e6c1096eec8 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
@@ -2953,14 +2954,16 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
return Ptr;
}
-bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP) {
+bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
+ unsigned CharSize) {
// Make sure the GEP has exactly three arguments.
if (GEP->getNumOperands() != 3)
return false;
- // Make sure the index-ee is a pointer to array of i8.
+ // Make sure the index-ee is a pointer to an array of \p CharSize integers.
ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
- if (!AT || !AT->getElementType()->isIntegerTy(8))
+ if (!AT || !AT->getElementType()->isIntegerTy(CharSize))
return false;
// Check to make sure that the first operand of the GEP is an integer and
@@ -2972,11 +2975,9 @@ bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP) {
return true;
}
-/// This function computes the length of a null-terminated C string pointed to
-/// by V. If successful, it returns true and returns the string in Str.
-/// If unsuccessful, it returns false.
-bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
- uint64_t Offset, bool TrimAtNul) {
+bool llvm::getConstantDataArrayInfo(const Value *V,
+ ConstantDataArraySlice &Slice,
+ unsigned ElementSize, uint64_t Offset) {
assert(V);
// Look through bitcast instructions and geps.
@@ -2987,7 +2988,7 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
// The GEP operator should be based on a pointer to string constant, and is
// indexing into the string constant.
- if (!isGEPBasedOnPointerToString(GEP))
+ if (!isGEPBasedOnPointerToString(GEP, ElementSize))
return false;
// If the second index isn't a ConstantInt, then this is a variable index
@@ -2998,8 +2999,8 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
StartIdx = CI->getZExtValue();
else
return false;
- return getConstantStringInfo(GEP->getOperand(0), Str, StartIdx + Offset,
- TrimAtNul);
+ return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize,
+ StartIdx + Offset);
}
// The GEP instruction, constant or instruction, must reference a global
@@ -3009,30 +3010,72 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
return false;
- // Handle the all-zeros case.
+ const ConstantDataArray *Array;
+ ArrayType *ArrayTy;
if (GV->getInitializer()->isNullValue()) {
- // This is a degenerate case. The initializer is constant zero so the
- // length of the string must be zero.
- Str = "";
- return true;
+ Type *GVTy = GV->getValueType();
+ if ((ArrayTy = dyn_cast<ArrayType>(GVTy))) {
+ // A zeroinitializer for the array; there is no ConstantDataArray.
+ Array = nullptr;
+ } else {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy);
+ uint64_t Length = SizeInBytes / (ElementSize / 8);
+ if (Length <= Offset)
+ return false;
+
+ Slice.Array = nullptr;
+ Slice.Offset = 0;
+ Slice.Length = Length - Offset;
+ return true;
+ }
+ } else {
+ // This must be a ConstantDataArray.
+ Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
+ if (!Array)
+ return false;
+ ArrayTy = Array->getType();
}
+ if (!ArrayTy->getElementType()->isIntegerTy(ElementSize))
+ return false;
- // This must be a ConstantDataArray.
- const auto *Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
- if (!Array || !Array->isString())
+ uint64_t NumElts = ArrayTy->getArrayNumElements();
+ if (Offset > NumElts)
return false;
- // Get the number of elements in the array.
- uint64_t NumElts = Array->getType()->getArrayNumElements();
+ Slice.Array = Array;
+ Slice.Offset = Offset;
+ Slice.Length = NumElts - Offset;
+ return true;
+}
- // Start out with the entire array in the StringRef.
- Str = Array->getAsString();
+/// This function computes the length of a null-terminated C string pointed to
+/// by V. If successful, it returns true and returns the string in Str.
+/// If unsuccessful, it returns false.
+bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
+ uint64_t Offset, bool TrimAtNul) {
+ ConstantDataArraySlice Slice;
+ if (!getConstantDataArrayInfo(V, Slice, 8, Offset))
+ return false;
- if (Offset > NumElts)
+ if (Slice.Array == nullptr) {
+ if (TrimAtNul) {
+ Str = StringRef();
+ return true;
+ }
+ if (Slice.Length == 1) {
+ Str = StringRef("", 1);
+ return true;
+ }
+ // We cannot instantiate a StringRef as we do not have an appropriate string
+ // of 0s at hand.
return false;
+ }
+ // Start out with the entire array in the StringRef.
+ Str = Slice.Array->getAsString();
// Skip over 'offset' bytes.
- Str = Str.substr(Offset);
+ Str = Str.substr(Slice.Offset);
if (TrimAtNul) {
// Trim off the \0 and anything after it. If the array is not nul
@@ -3050,7 +3093,8 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
static uint64_t GetStringLengthH(const Value *V,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
+ SmallPtrSetImpl<const PHINode*> &PHIs,
+ unsigned CharSize) {
// Look through noop bitcast instructions.
V = V->stripPointerCasts();
@@ -3063,7 +3107,7 @@ static uint64_t GetStringLengthH(const Value *V,
// If it was new, see if all the input strings are the same length.
uint64_t LenSoFar = ~0ULL;
for (Value *IncValue : PN->incoming_values()) {
- uint64_t Len = GetStringLengthH(IncValue, PHIs);
+ uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize);
if (Len == 0) return 0; // Unknown length -> unknown.
if (Len == ~0ULL) continue;
@@ -3079,9 +3123,9 @@ static uint64_t GetStringLengthH(const Value *V,
// strlen(select(c,x,y)) -> strlen(x) ^ strlen(y)
if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
- uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+ uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize);
if (Len1 == 0) return 0;
- uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+ uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize);
if (Len2 == 0) return 0;
if (Len1 == ~0ULL) return Len2;
if (Len2 == ~0ULL) return Len1;
@@ -3090,20 +3134,30 @@ static uint64_t GetStringLengthH(const Value *V,
}
// Otherwise, see if we can read the string.
- StringRef StrData;
- if (!getConstantStringInfo(V, StrData))
+ ConstantDataArraySlice Slice;
+ if (!getConstantDataArrayInfo(V, Slice, CharSize))
return 0;
- return StrData.size()+1;
+ if (Slice.Array == nullptr)
+ return 1;
+
+ // Search for nul characters
+ unsigned NullIndex = 0;
+ for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) {
+ if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0)
+ break;
+ }
+
+ return NullIndex + 1;
}
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
-uint64_t llvm::GetStringLength(const Value *V) {
+uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
if (!V->getType()->isPointerTy()) return 0;
SmallPtrSet<const PHINode*, 32> PHIs;
- uint64_t Len = GetStringLengthH(V, PHIs);
+ uint64_t Len = GetStringLengthH(V, PHIs, CharSize);
// If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
// an empty string as a length.
return Len == ~0ULL ? 1 : Len;
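With the slice-based lookup in place, GetStringLengthH() no longer relies on StringRef: it scans the constant data for the first nul element and returns its index plus one. A small sketch of that scan, assuming a plain std::vector stands in for the ConstantDataArray slice:

#include <cstdint>
#include <vector>

// Return the index of the first zero element plus one; if no terminator is
// present, this yields Slice.size() + 1, mirroring the loop added above.
static uint64_t lengthWithNul(const std::vector<unsigned> &Slice) {
  uint64_t NullIndex = 0;
  for (uint64_t E = Slice.size(); NullIndex < E; ++NullIndex)
    if (Slice[NullIndex] == 0)
      break;
  return NullIndex + 1;
}

int main() {
  std::vector<unsigned> Wide = {'h', 'i', 0, 'x'}; // e.g. a 4-byte-char string
  return lengthWithNul(Wide) == 3 ? 0 : 1;
}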
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index d7602c83435c..ff1ea44a18a7 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -2502,7 +2502,7 @@ LLParser::PerFunctionState::~PerFunctionState() {
continue;
P.second.first->replaceAllUsesWith(
UndefValue::get(P.second.first->getType()));
- delete P.second.first;
+ P.second.first->deleteValue();
}
for (const auto &P : ForwardRefValIDs) {
@@ -2510,7 +2510,7 @@ LLParser::PerFunctionState::~PerFunctionState() {
continue;
P.second.first->replaceAllUsesWith(
UndefValue::get(P.second.first->getType()));
- delete P.second.first;
+ P.second.first->deleteValue();
}
}
@@ -2642,7 +2642,7 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
getTypeString(FI->second.first->getType()) + "'");
Sentinel->replaceAllUsesWith(Inst);
- delete Sentinel;
+ Sentinel->deleteValue();
ForwardRefValIDs.erase(FI);
}
@@ -2659,7 +2659,7 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
getTypeString(FI->second.first->getType()) + "'");
Sentinel->replaceAllUsesWith(Inst);
- delete Sentinel;
+ Sentinel->deleteValue();
ForwardRefVals.erase(FI);
}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 76298121566a..686c94687669 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4489,11 +4489,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
// Add instruction to end of current BB. If there is no current BB, reject
// this file.
if (!CurBB) {
- delete I;
+ I->deleteValue();
return error("Invalid instruction with no BB");
}
if (!OperandBundles.empty()) {
- delete I;
+ I->deleteValue();
return error("Operand bundles found with no consumer");
}
CurBB->getInstList().push_back(I);
diff --git a/lib/Bitcode/Reader/ValueList.cpp b/lib/Bitcode/Reader/ValueList.cpp
index d1a2a11bbfad..f2a3439a87be 100644
--- a/lib/Bitcode/Reader/ValueList.cpp
+++ b/lib/Bitcode/Reader/ValueList.cpp
@@ -73,7 +73,7 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) {
// If there was a forward reference to this value, replace it.
Value *PrevVal = OldV;
OldV->replaceAllUsesWith(V);
- delete PrevVal;
+ PrevVal->deleteValue();
}
}
@@ -194,6 +194,6 @@ void BitcodeReaderValueList::resolveConstantForwardRefs() {
// Update all ValueHandles, they should be the only users at this point.
Placeholder->replaceAllUsesWith(RealVal);
- delete Placeholder;
+ Placeholder->deleteValue();
}
}
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 7d945690e9c3..1b39e46ee466 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -13,7 +13,6 @@
#include "CodeViewDebug.h"
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/Line.h"
@@ -23,6 +22,7 @@
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/IR/Constants.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -469,17 +469,21 @@ void CodeViewDebug::emitTypeInformation() {
CommentPrefix += ' ';
}
- TypeDatabase TypeDB(TypeTable.records().size());
- CVTypeDumper CVTD(TypeDB);
- TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) {
+ TypeTableCollection Table(TypeTable.records());
+ Optional<TypeIndex> B = Table.getFirst();
+ while (B) {
+ // This will fail if the record data is invalid.
+ CVType Record = Table.getType(*B);
+
if (OS.isVerboseAsm()) {
// Emit a block comment describing the type record for readability.
SmallString<512> CommentBlock;
raw_svector_ostream CommentOS(CommentBlock);
ScopedPrinter SP(CommentOS);
SP.setPrefix(CommentPrefix);
- TypeDumpVisitor TDV(TypeDB, &SP, false);
- Error E = CVTD.dump(Record, TDV);
+ TypeDumpVisitor TDV(Table, &SP, false);
+
+ Error E = codeview::visitTypeRecord(Record, *B, TDV);
if (E) {
logAllUnhandledErrors(std::move(E), errs(), "error: ");
llvm_unreachable("produced malformed type record");
@@ -489,29 +493,10 @@ void CodeViewDebug::emitTypeInformation() {
// newline.
OS.emitRawComment(
CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
- } else {
-#ifndef NDEBUG
- // Assert that the type data is valid even if we aren't dumping
- // comments. The MSVC linker doesn't do much type record validation,
- // so the first link of an invalid type record can succeed while
- // subsequent links will fail with LNK1285.
- BinaryByteStream Stream(Record, llvm::support::little);
- CVTypeArray Types;
- BinaryStreamReader Reader(Stream);
- Error E = Reader.readArray(Types, Reader.getLength());
- if (!E) {
- TypeVisitorCallbacks C;
- E = codeview::visitTypeStream(Types, C);
- }
- if (E) {
- logAllUnhandledErrors(std::move(E), errs(), "error: ");
- llvm_unreachable("produced malformed type record");
- }
-#endif
}
- StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
- OS.EmitBinaryData(S);
- });
+ OS.EmitBinaryData(Record.str_data());
+ B = Table.getNext(*B);
+ }
}
namespace {
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 17e6be05eb42..984973cf3a3b 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -35,12 +36,10 @@ using namespace llvm;
namespace {
class AtomicExpand: public FunctionPass {
- const TargetMachine *TM;
const TargetLowering *TLI;
public:
static char ID; // Pass identification, replacement for typeid
- explicit AtomicExpand(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr) {
+ AtomicExpand() : FunctionPass(ID), TLI(nullptr) {
initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
}
@@ -97,12 +96,10 @@ namespace {
char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
-INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
- false, false)
+INITIALIZE_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
+ false, false)
-FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
- return new AtomicExpand(TM);
-}
+FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }
namespace {
// Helper functions to retrieve the size of atomic instructions.
@@ -172,9 +169,14 @@ bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
} // end anonymous namespace
bool AtomicExpand::runOnFunction(Function &F) {
- if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
return false;
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ TLI = TM.getSubtargetImpl(F)->getTargetLowering();
SmallVector<Instruction *, 1> AtomicInsts;
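The same conversion recurs in several passes below: instead of threading a TargetMachine pointer through the constructor, the pass asks for TargetPassConfig at run time and bails out when it is absent. A condensed sketch of the pattern, using a hypothetical DemoPass rather than any pass touched here:

#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

namespace {
struct DemoPass : public llvm::FunctionPass {
  static char ID;
  DemoPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    (void)F;
    // Only available when running inside a codegen pipeline.
    auto *TPC = getAnalysisIfAvailable<llvm::TargetPassConfig>();
    if (!TPC)
      return false;
    auto &TM = TPC->getTM<llvm::TargetMachine>();
    (void)TM; // query subtarget, options, etc. from here
    return false;
  }
};
char DemoPass::ID = 0;
} // end anonymous namespace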
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 55a27e2fb79e..2b5863aa5800 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -49,7 +49,6 @@ add_llvm_library(LLVMCodeGen
LivePhysRegs.cpp
LiveRangeCalc.cpp
LiveRangeEdit.cpp
- LiveRangeShrink.cpp
LiveRegMatrix.cpp
LiveRegUnits.cpp
LiveStackAnalysis.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 4d30c6574b12..2a2715beaadc 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -43,7 +43,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
- initializeLiveRangeShrinkPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index f2e024c5e3bd..3a1a3020a8d4 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -197,10 +198,11 @@ class TypePromotionTransaction;
public:
static char ID; // Pass identification, replacement for typeid
- explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
- initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
- }
+ CodeGenPrepare()
+ : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr),
+ DL(nullptr) {
+ initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+ }
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "CodeGen Prepare"; }
@@ -255,15 +257,13 @@ class TypePromotionTransaction;
}
char CodeGenPrepare::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
- "Optimize for code generation", false, false)
+INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
+ "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare",
- "Optimize for code generation", false, false)
+INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare",
+ "Optimize for code generation", false, false)
-FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
- return new CodeGenPrepare(TM);
-}
+FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
bool CodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
@@ -279,7 +279,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
BPI.reset();
ModifiedDT = false;
- if (TM) {
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ TM = &TPC->getTM<TargetMachine>();
SubtargetInfo = TM->getSubtargetImpl(F);
TLI = SubtargetInfo->getTargetLowering();
TRI = SubtargetInfo->getRegisterInfo();
@@ -349,7 +350,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// Really free removed instructions during promotion.
for (Instruction *I : RemovedInsts)
- delete I;
+ I->deleteValue();
EverMadeChange |= MadeChange;
}
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 38af19a04448..1ef4d8660657 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CFG.h"
@@ -34,8 +35,6 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered");
namespace {
class DwarfEHPrepare : public FunctionPass {
- const TargetMachine *TM;
-
// RewindFunction - _Unwind_Resume or the target equivalent.
Constant *RewindFunction;
@@ -52,15 +51,9 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid.
- // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in
- // practice.
DwarfEHPrepare()
- : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr), DT(nullptr),
- TLI(nullptr) {}
-
- DwarfEHPrepare(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), RewindFunction(nullptr), DT(nullptr),
- TLI(nullptr) {}
+ : FunctionPass(ID), RewindFunction(nullptr), DT(nullptr), TLI(nullptr) {
+ }
bool runOnFunction(Function &Fn) override;
@@ -78,18 +71,18 @@ namespace {
} // end anonymous namespace
char DwarfEHPrepare::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare",
- "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare",
+ "Prepare DWARF exceptions", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_TM_PASS_END(DwarfEHPrepare, "dwarfehprepare",
- "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_END(DwarfEHPrepare, "dwarfehprepare",
+ "Prepare DWARF exceptions", false, false)
-FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
- return new DwarfEHPrepare(TM);
-}
+FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); }
void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
}
@@ -254,9 +247,10 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
}
bool DwarfEHPrepare::runOnFunction(Function &Fn) {
- assert(TM && "DWARF EH preparation requires a target machine");
+ const TargetMachine &TM =
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ TLI = TM.getSubtargetImpl(Fn)->getTargetLowering();
bool Changed = InsertUnwindResumeCalls(Fn);
DT = nullptr;
TLI = nullptr;
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 77dfb13ac1f2..afc18a15aa1c 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -340,6 +340,15 @@ bool IRTranslator::translateExtractValue(const User &U,
Type *Int32Ty = Type::getInt32Ty(U.getContext());
SmallVector<Value *, 1> Indices;
+ // If Src is a single element ConstantStruct, translate extractvalue
+ // to that element to avoid inserting a cast instruction.
+ if (auto CS = dyn_cast<ConstantStruct>(Src))
+ if (CS->getNumOperands() == 1) {
+ unsigned Res = getOrCreateVReg(*CS->getOperand(0));
+ ValToVReg[&U] = Res;
+ return true;
+ }
+
// getIndexedOffsetInType is designed for GEPs, so the first index is the
// usual array element rather than looking into the actual aggregate.
Indices.push_back(ConstantInt::get(Int32Ty, 0));
@@ -1108,6 +1117,23 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
default:
return false;
}
+ } else if (auto CS = dyn_cast<ConstantStruct>(&C)) {
+ // Return the element if it is a single element ConstantStruct.
+ if (CS->getNumOperands() == 1) {
+ unsigned EltReg = getOrCreateVReg(*CS->getOperand(0));
+ EntryBuilder.buildCast(Reg, EltReg);
+ return true;
+ }
+ SmallVector<unsigned, 4> Ops;
+ SmallVector<uint64_t, 4> Indices;
+ uint64_t Offset = 0;
+ for (unsigned i = 0; i < CS->getNumOperands(); ++i) {
+ unsigned OpReg = getOrCreateVReg(*CS->getOperand(i));
+ Ops.push_back(OpReg);
+ Indices.push_back(Offset);
+ Offset += MRI->getType(OpReg).getSizeInBits();
+ }
+ EntryBuilder.buildSequence(Reg, Ops, Indices);
} else if (auto CV = dyn_cast<ConstantVector>(&C)) {
if (CV->getNumOperands() == 1)
return translate(*CV->getOperand(0), Reg);
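For multi-element ConstantStructs, the translator collects one virtual register per element and records the running bit offset at which each element starts before emitting the sequence. A small sketch of that bookkeeping, assuming hard-coded element sizes in place of MRI->getType():

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t> SizesInBits = {32, 64, 8}; // e.g. {i32, i64, i8}
  std::vector<uint64_t> Indices;
  uint64_t Offset = 0;
  for (uint64_t Size : SizesInBits) {
    // Each element starts at the current offset; the offset then advances
    // by that element's size in bits.
    Indices.push_back(Offset);
    Offset += Size;
  }
  assert(Indices[0] == 0 && Indices[1] == 32 && Indices[2] == 96);
  return 0;
}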
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index c67da8629a3b..4c0b06dffd21 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -73,7 +73,7 @@ bool InstructionSelector::isOperandImmEqual(
const MachineOperand &MO, int64_t Value,
const MachineRegisterInfo &MRI) const {
- if (MO.getReg())
+ if (MO.isReg() && MO.getReg())
if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI))
return *VRegVal == Value;
return false;
diff --git a/lib/CodeGen/InterleavedAccessPass.cpp b/lib/CodeGen/InterleavedAccessPass.cpp
index ec35b3f6449e..bb29db301a95 100644
--- a/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/lib/CodeGen/InterleavedAccessPass.cpp
@@ -45,6 +45,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
@@ -68,8 +69,7 @@ class InterleavedAccess : public FunctionPass {
public:
static char ID;
- InterleavedAccess(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
+ InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) {
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
}
@@ -84,7 +84,6 @@ public:
private:
DominatorTree *DT;
- const TargetMachine *TM;
const TargetLowering *TLI;
/// The maximum supported interleave factor.
@@ -108,18 +107,18 @@ private:
} // end anonymous namespace.
char InterleavedAccess::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(
+INITIALIZE_PASS_BEGIN(
InterleavedAccess, "interleaved-access",
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_TM_PASS_END(
+INITIALIZE_PASS_END(
InterleavedAccess, "interleaved-access",
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
-FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
- return new InterleavedAccess(TM);
+FunctionPass *llvm::createInterleavedAccessPass() {
+ return new InterleavedAccess();
}
/// \brief Check if the mask is a DE-interleave mask of the given factor
@@ -426,13 +425,15 @@ bool InterleavedAccess::lowerInterleavedStore(
}
bool InterleavedAccess::runOnFunction(Function &F) {
- if (!TM || !LowerInterleavedAccesses)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC || !LowerInterleavedAccesses)
return false;
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ auto &TM = TPC->getTM<TargetMachine>();
+ TLI = TM.getSubtargetImpl(F)->getTargetLowering();
MaxFactor = TLI->getMaxSupportedInterleaveFactor();
// Holds dead instructions that will be erased later.
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 7b1706f0f4ba..be3b258315bb 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -109,26 +109,24 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
AnalysisID StopAfter,
MachineFunctionInitializer *MFInitializer = nullptr) {
- // When in emulated TLS mode, add the LowerEmuTLS pass.
- if (TM->Options.EmulatedTLS)
- PM.add(createLowerEmuTLSPass(TM));
-
- PM.add(createPreISelIntrinsicLoweringPass());
-
- // Add internal analysis passes from the target machine.
- PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
-
// Targets may override createPassConfig to provide a target-specific
// subclass.
TargetPassConfig *PassConfig = TM->createPassConfig(PM);
PassConfig->setStartStopPasses(StartBefore, StartAfter, StopBefore,
StopAfter);
-
// Set PassConfig options provided by TargetMachine.
PassConfig->setDisableVerify(DisableVerify);
-
PM.add(PassConfig);
+ // When in emulated TLS mode, add the LowerEmuTLS pass.
+ if (TM->Options.EmulatedTLS)
+ PM.add(createLowerEmuTLSPass());
+
+ PM.add(createPreISelIntrinsicLoweringPass());
+
+ // Add internal analysis passes from the target machine.
+ PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+
PassConfig->addIRPasses();
PassConfig->addCodeGenPrepare();
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
deleted file mode 100644
index 00182e2c779f..000000000000
--- a/lib/CodeGen/LiveRangeShrink.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-///===---------------------------------------------------------------------===//
-///
-/// \file
-/// This pass moves instructions close to the definition of its operands to
-/// shrink live range of the def instruction. The code motion is limited within
-/// the basic block. The moved instruction should have 1 def, and more than one
-/// uses, all of which are the only use of the def.
-///
-///===---------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "lrshrink"
-
-STATISTIC(NumInstrsHoistedToShrinkLiveRange,
- "Number of insructions hoisted to shrink live range.");
-
-using namespace llvm;
-
-namespace {
-class LiveRangeShrink : public MachineFunctionPass {
-public:
- static char ID;
-
- LiveRangeShrink() : MachineFunctionPass(ID) {
- initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- StringRef getPassName() const override { return "Live Range Shrink"; }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-} // End anonymous namespace.
-
-char LiveRangeShrink::ID = 0;
-char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
-
-INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
- false)
-namespace {
-typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
-
-/// Returns \p New if it's dominated by \p Old, otherwise return \p Old.
-/// \p M maintains a map from instruction to its dominating order that satisfies
-/// M[A] > M[B] guarantees that A is dominated by B.
-/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
-/// \p New.
-MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
- const InstOrderMap &M) {
- auto NewIter = M.find(&New);
- if (NewIter == M.end())
- return Old;
- if (Old == nullptr)
- return &New;
- unsigned OrderOld = M.find(Old)->second;
- unsigned OrderNew = NewIter->second;
- if (OrderOld != OrderNew)
- return OrderOld < OrderNew ? &New : Old;
- // OrderOld == OrderNew, we need to iterate down from Old to see if it
- // can reach New, if yes, New is dominated by Old.
- for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
- I = I->getNextNode())
- if (I == &New)
- return &New;
- return Old;
-}
-
-/// Builds Instruction to its dominating order number map \p M by traversing
-/// from instruction \p Start.
-void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
- M.clear();
- unsigned i = 0;
- for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
- M[&I] = i++;
-}
-} // end anonymous namespace
-
-bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
- return false;
-
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
-
- InstOrderMap IOM;
- // Map from register to instruction order (value of IOM) where the
- // register is used last. When moving instructions up, we need to
- // make sure all its defs (including dead def) will not cross its
- // last use when moving up.
- DenseMap<unsigned, unsigned> UseMap;
-
- for (MachineBasicBlock &MBB : MF) {
- if (MBB.empty())
- continue;
- bool SawStore = false;
- BuildInstOrderMap(MBB.begin(), IOM);
- UseMap.clear();
-
- for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
- MachineInstr &MI = *Next;
- ++Next;
- if (MI.isPHI() || MI.isDebugValue())
- continue;
- if (MI.mayStore())
- SawStore = true;
-
- unsigned CurrentOrder = IOM[&MI];
- unsigned Barrier = 0;
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg() || MO.isDebug())
- continue;
- if (MO.isUse())
- UseMap[MO.getReg()] = CurrentOrder;
- else if (MO.isDead() && UseMap.count(MO.getReg()))
- // Barrier is the last instruction where MO get used. MI should not
- // be moved above Barrier.
- Barrier = std::max(Barrier, UseMap[MO.getReg()]);
- }
-
- if (!MI.isSafeToMove(nullptr, SawStore)) {
- // If MI has side effects, it should become a barrier for code motion.
- // IOM is rebuild from the next instruction to prevent later
- // instructions from being moved before this MI.
- if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
- BuildInstOrderMap(Next, IOM);
- SawStore = false;
- }
- continue;
- }
-
- const MachineOperand *DefMO = nullptr;
- MachineInstr *Insert = nullptr;
-
- // Number of live-ranges that will be shortened. We do not count
- // live-ranges that are defined by a COPY as it could be coalesced later.
- unsigned NumEligibleUse = 0;
-
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg() || MO.isDead() || MO.isDebug())
- continue;
- unsigned Reg = MO.getReg();
- // Do not move the instruction if it def/uses a physical register,
- // unless it is a constant physical register.
- if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
- !MRI.isConstantPhysReg(Reg)) {
- Insert = nullptr;
- break;
- }
- if (MO.isDef()) {
- // Do not move if there is more than one def.
- if (DefMO) {
- Insert = nullptr;
- break;
- }
- DefMO = &MO;
- } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
- MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
- if (!DefInstr.isCopy())
- NumEligibleUse++;
- Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
- } else {
- Insert = nullptr;
- break;
- }
- }
- // Move the instruction when # of shrunk live range > 1.
- if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
- MachineBasicBlock::iterator I = std::next(Insert->getIterator());
- // Skip all the PHI and debug instructions.
- while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
- I = std::next(I);
- if (I == MI.getIterator())
- continue;
-
- // Update the dominator order to be the same as the insertion point.
- // We do this to maintain a non-decreasing order without need to update
- // all instruction orders after the insertion point.
- unsigned NewOrder = IOM[&*I];
- IOM[&MI] = NewOrder;
- NumInstrsHoistedToShrinkLiveRange++;
-
- // Find MI's debug value following MI.
- MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
- if (MI.getOperand(0).isReg())
- for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
- EndIter->getOperand(0).isReg() &&
- EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
- ++EndIter, ++Next)
- IOM[&*EndIter] = NewOrder;
- MBB.splice(I, &MBB, MI.getIterator(), EndIter);
- }
- }
- }
- return false;
-}
diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp
index 6966c8ca4a5f..5fb5b747f471 100644
--- a/lib/CodeGen/LowerEmuTLS.cpp
+++ b/lib/CodeGen/LowerEmuTLS.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
@@ -28,14 +29,12 @@ using namespace llvm;
namespace {
class LowerEmuTLS : public ModulePass {
- const TargetMachine *TM;
public:
static char ID; // Pass identification, replacement for typeid
- explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { }
- explicit LowerEmuTLS(const TargetMachine *TM)
- : ModulePass(ID), TM(TM) {
+ LowerEmuTLS() : ModulePass(ID) {
initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry());
}
+
bool runOnModule(Module &M) override;
private:
bool addEmuTlsVar(Module &M, const GlobalVariable *GV);
@@ -55,18 +54,21 @@ private:
char LowerEmuTLS::ID = 0;
INITIALIZE_PASS(LowerEmuTLS, "loweremutls",
- "Add __emutls_[vt]. variables for emultated TLS model",
- false, false)
+ "Add __emutls_[vt]. variables for emultated TLS model", false,
+ false)
-ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) {
- return new LowerEmuTLS(TM);
-}
+ModulePass *llvm::createLowerEmuTLSPass() { return new LowerEmuTLS(); }
bool LowerEmuTLS::runOnModule(Module &M) {
if (skipModule(M))
return false;
- if (!TM || !TM->Options.EmulatedTLS)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.Options.EmulatedTLS)
return false;
bool Changed = false;
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 5003115a770f..adfca9a46239 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -245,25 +245,26 @@ public:
/// updating the block -> chain mapping. It does not free or tear down the
/// old chain, but the old chain's block list is no longer valid.
void merge(MachineBasicBlock *BB, BlockChain *Chain) {
- assert(BB);
- assert(!Blocks.empty());
+ assert(BB && "Can't merge a null block.");
+ assert(!Blocks.empty() && "Can't merge into an empty chain.");
// Fast path in case we don't have a chain already.
if (!Chain) {
- assert(!BlockToChain[BB]);
+ assert(!BlockToChain[BB] &&
+ "Passed chain is null, but BB has entry in BlockToChain.");
Blocks.push_back(BB);
BlockToChain[BB] = this;
return;
}
- assert(BB == *Chain->begin());
+ assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
assert(Chain->begin() != Chain->end());
// Update the incoming blocks to point to this chain, and add them to the
// chain structure.
for (MachineBasicBlock *ChainBB : *Chain) {
Blocks.push_back(ChainBB);
- assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain");
+ assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
BlockToChain[ChainBB] = this;
}
}
@@ -1547,13 +1548,15 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
MachineBasicBlock *BestBlock = nullptr;
BlockFrequency BestFreq;
for (MachineBasicBlock *MBB : WorkList) {
- assert(MBB->isEHPad() == IsEHPad);
+ assert(MBB->isEHPad() == IsEHPad &&
+ "EHPad mismatch between block and work list.");
BlockChain &SuccChain = *BlockToChain[MBB];
if (&SuccChain == &Chain)
continue;
- assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block");
+ assert(SuccChain.UnscheduledPredecessors == 0 &&
+ "Found CFG-violating block");
BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
@@ -1621,9 +1624,12 @@ void MachineBlockPlacement::fillWorkLists(
if (!UpdatedPreds.insert(&Chain).second)
return;
- assert(Chain.UnscheduledPredecessors == 0);
+ assert(
+ Chain.UnscheduledPredecessors == 0 &&
+ "Attempting to place block with unscheduled predecessors in worklist.");
for (MachineBasicBlock *ChainBB : Chain) {
- assert(BlockToChain[ChainBB] == &Chain);
+ assert(BlockToChain[ChainBB] == &Chain &&
+ "Block in chain doesn't match BlockToChain map.");
for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
if (BlockFilter && !BlockFilter->count(Pred))
continue;
@@ -2136,8 +2142,10 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
for (const MachineLoop *InnerLoop : L)
buildLoopChains(*InnerLoop);
- assert(BlockWorkList.empty());
- assert(EHPadWorkList.empty());
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList not empty when starting to build loop chains.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList not empty when starting to build loop chains.");
BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
// Check if we have profile data for this function. If yes, we will rotate
@@ -2167,7 +2175,8 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
// walk the blocks, and use a set to prevent visiting a particular chain
// twice.
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
- assert(LoopChain.UnscheduledPredecessors == 0);
+ assert(LoopChain.UnscheduledPredecessors == 0 &&
+ "LoopChain should not have unscheduled predecessors.");
UpdatedPreds.insert(&LoopChain);
for (const MachineBasicBlock *LoopBB : LoopBlockSet)
@@ -2256,8 +2265,10 @@ void MachineBlockPlacement::buildCFGChains() {
for (MachineLoop *L : *MLI)
buildLoopChains(*L);
- assert(BlockWorkList.empty());
- assert(EHPadWorkList.empty());
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList should be empty before building final chain.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList should be empty before building final chain.");
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
for (MachineBasicBlock &MBB : *F)
@@ -2651,8 +2662,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
// there are no MachineLoops.
PreferredLoopExit = nullptr;
- assert(BlockToChain.empty());
- assert(ComputedEdges.empty());
+ assert(BlockToChain.empty() &&
+ "BlockToChain map should be empty before starting placement.");
+ assert(ComputedEdges.empty() &&
+ "Computed Edge map should be empty before starting placement.");
unsigned TailDupSize = TailDupPlacementThreshold;
// If only the aggressive threshold is explicitly set, use it.
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 2f0f4297ef5c..6cf751d34e26 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -32,8 +32,8 @@ using namespace llvm;
using namespace llvm::dwarf;
// Handle the Pass registration stuff necessary to use DataLayout's.
-INITIALIZE_TM_PASS(MachineModuleInfo, "machinemoduleinfo",
- "Machine Module Information", false, false)
+INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
+ "Machine Module Information", false, false)
char MachineModuleInfo::ID = 0;
// Out of line virtual method.
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index d2afeae9e70b..aaa253fde494 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -60,19 +60,8 @@ namespace {
class PEI : public MachineFunctionPass {
public:
static char ID;
- explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) {
+ PEI() : MachineFunctionPass(ID) {
initializePEIPass(*PassRegistry::getPassRegistry());
-
- if (TM && (!TM->usesPhysRegsForPEI())) {
- SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
- unsigned &, unsigned &, const MBBVector &,
- const MBBVector &) {};
- ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
- } else {
- SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
- ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
- UsesCalleeSaves = true;
- }
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -140,18 +129,17 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
cl::desc("Warn for stack size bigger than the given"
" number"));
-INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion",
- false, false)
+INITIALIZE_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", false,
+ false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_TM_PASS_END(PEI, "prologepilog",
- "Prologue/Epilogue Insertion & Frame Finalization",
- false, false)
+INITIALIZE_PASS_END(PEI, "prologepilog",
+ "Prologue/Epilogue Insertion & Frame Finalization", false,
+ false)
-MachineFunctionPass *
-llvm::createPrologEpilogInserterPass(const TargetMachine *TM) {
- return new PEI(TM);
+MachineFunctionPass *llvm::createPrologEpilogInserterPass() {
+ return new PEI();
}
STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
@@ -174,6 +162,20 @@ typedef SmallSetVector<int, 8> StackObjSet;
/// frame indexes with appropriate references.
///
bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+ if (!SpillCalleeSavedRegisters) {
+ const TargetMachine &TM = Fn.getTarget();
+ if (!TM.usesPhysRegsForPEI()) {
+ SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
+ unsigned &, unsigned &, const MBBVector &,
+ const MBBVector &) {};
+ ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
+ } else {
+ SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
+ ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
+ UsesCalleeSaves = true;
+ }
+ }
+
const Function* F = Fn.getFunction();
const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 1803ea2b9249..7b3a5d5c5ff7 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -2666,11 +2666,17 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
// Look for values being erased.
bool DidPrune = false;
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
- if (Vals[i].Resolution != CR_Erase)
+ // We should trigger in all cases in which eraseInstrs() does something,
+ // to match what eraseInstrs() is doing.
+ if (Vals[i].Resolution != CR_Erase &&
+ (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef ||
+ !Vals[i].Pruned))
continue;
// Check subranges at the point where the copy will be removed.
SlotIndex Def = LR.getValNumInfo(i)->def;
+ // Print message so mismatches with eraseInstrs() can be diagnosed.
+ DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n');
for (LiveInterval::SubRange &S : LI.subranges()) {
LiveQueryResult Q = S.Query(Def);
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 08b3d345f689..2771fdbd737a 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
@@ -767,13 +768,12 @@ class SafeStackLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid..
- SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+ SafeStackLegacyPass() : FunctionPass(ID), TM(nullptr) {
initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
}
- SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
}
@@ -793,8 +793,7 @@ public:
return false;
}
- if (!TM)
- report_fatal_error("Target machine is required");
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
if (!TL)
report_fatal_error("TargetLowering instance is required");
@@ -821,11 +820,10 @@ public:
} // anonymous namespace
char SafeStackLegacyPass::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
- "Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
- "Safe Stack instrumentation pass", false, false)
+INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
+ "Safe Stack instrumentation pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(SafeStackLegacyPass, "safe-stack",
+ "Safe Stack instrumentation pass", false, false)
-FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
- return new SafeStackLegacyPass(TM);
-}
+FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); }
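
Several hunks in this patch (SafeStack above, StackProtector and the EH/CodeGenPrepare pass constructors further down) follow the same recipe: stop passing a TargetMachine pointer through the pass constructor and instead declare a dependency on TargetPassConfig, then fetch the machine when the pass runs. A minimal sketch of that recipe, with a hypothetical pass name and no registration boilerplate, looks roughly like this:

// Sketch only: MyTMUserPass is hypothetical; the TargetPassConfig/getTM<>
// calls mirror the pattern introduced by this patch.
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

namespace {
class MyTMUserPass : public llvm::FunctionPass {
public:
  static char ID;
  MyTMUserPass() : llvm::FunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // Requiring TargetPassConfig guarantees getAnalysis<> succeeds below and
    // fails loudly if no target machine was ever configured.
    AU.addRequired<llvm::TargetPassConfig>();
  }

  bool runOnFunction(llvm::Function &F) override {
    auto &TM =
        getAnalysis<llvm::TargetPassConfig>().getTM<llvm::TargetMachine>();
    // Use TM.getSubtargetImpl(F), TM.getTargetTriple(), etc. from here on.
    (void)TM;
    (void)F;
    return false;
  }
};
} // end anonymous namespace

char MyTMUserPass::ID = 0;
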
diff --git a/lib/CodeGen/SafeStackColoring.cpp b/lib/CodeGen/SafeStackColoring.cpp
index 09289f947dc9..21f2fa497233 100644
--- a/lib/CodeGen/SafeStackColoring.cpp
+++ b/lib/CodeGen/SafeStackColoring.cpp
@@ -20,9 +20,10 @@ using namespace llvm::safestack;
#define DEBUG_TYPE "safestackcoloring"
+// Disabled by default due to PR32143.
static cl::opt<bool> ClColoring("safe-stack-coloring",
cl::desc("enable safe stack coloring"),
- cl::Hidden, cl::init(true));
+ cl::Hidden, cl::init(false));
const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
const auto IT = AllocaNumbering.find(AI);
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0ccee175abfb..5d450e7e078c 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2138,6 +2138,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
if (isNullConstant(CarryIn))
return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+ // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
+ if (isNullConstant(N0) && isNullConstant(N1)) {
+ EVT VT = N0.getValueType();
+ EVT CarryVT = CarryIn.getValueType();
+ SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
+ AddToWorklist(CarryExt.getNode());
+ return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
+ DAG.getConstant(1, DL, VT)),
+ DAG.getConstant(0, DL, CarryVT));
+ }
+
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
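
As a quick sanity check of the fold added above (a sketch, not part of the patch): with both addends zero, the sum is just the incoming carry bit and the outgoing carry is always zero, which is exactly the (and (ext/trunc X), 1) plus constant-0 pair the combine builds. A scalar model of the node's semantics, using a hypothetical 32-bit helper, behaves as follows:

// Scalar model of ISD::ADDCARRY for illustration only; addcarry32 is a
// hypothetical helper, not an LLVM API.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> addcarry32(uint32_t A, uint32_t B,
                                                uint32_t CarryIn) {
  uint64_t Wide = (uint64_t)A + (uint64_t)B + (uint64_t)(CarryIn & 1);
  return {(uint32_t)Wide, (uint32_t)(Wide >> 32)}; // {sum, carry-out}
}

// addcarry32(0, 0, C) == {C & 1, 0}: the sum is the masked carry-in and the
// carry-out is zero, matching the (and (ext/trunc X), 1) / constant-0 result.
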
@@ -12533,49 +12544,54 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// mergeable cases. To prevent this, we prune such stores from the
// front of StoreNodes here.
- unsigned StartIdx = 0;
- while ((StartIdx + 1 < StoreNodes.size()) &&
- StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
- StoreNodes[StartIdx + 1].OffsetFromBase)
- ++StartIdx;
-
- // Bail if we don't have enough candidates to merge.
- if (StartIdx + 1 >= StoreNodes.size())
- return false;
-
- if (StartIdx)
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
+ bool RV = false;
+ while (StoreNodes.size() > 1) {
+ unsigned StartIdx = 0;
+ while ((StartIdx + 1 < StoreNodes.size()) &&
+ StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
+ StoreNodes[StartIdx + 1].OffsetFromBase)
+ ++StartIdx;
- // Scan the memory operations on the chain and find the first non-consecutive
- // store memory address.
- unsigned NumConsecutiveStores = 0;
- int64_t StartAddress = StoreNodes[0].OffsetFromBase;
-
- // Check that the addresses are consecutive starting from the second
- // element in the list of stores.
- for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
- int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- NumConsecutiveStores = i + 1;
- }
+ // Bail if we don't have enough candidates to merge.
+ if (StartIdx + 1 >= StoreNodes.size())
+ return RV;
- if (NumConsecutiveStores < 2)
- return false;
+ if (StartIdx)
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive store memory address.
+ unsigned NumConsecutiveStores = 1;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+ // Check that the addresses are consecutive starting from the second
+ // element in the list of stores.
+ for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
+ int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ NumConsecutiveStores = i + 1;
+ }
- // Check that we can merge these candidates without causing a cycle
- if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores))
- return false;
+ if (NumConsecutiveStores < 2) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumConsecutiveStores);
+ continue;
+ }
+ // Check that we can merge these candidates without causing a cycle
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes,
+ NumConsecutiveStores)) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumConsecutiveStores);
+ continue;
+ }
- // The node with the lowest store address.
- LLVMContext &Context = *DAG.getContext();
- const DataLayout &DL = DAG.getDataLayout();
+ // The node with the lowest store address.
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
- // Store the constants into memory as one consecutive store.
- if (IsConstantSrc) {
- bool RV = false;
- while (NumConsecutiveStores > 1) {
+ // Store the constants into memory as one consecutive store.
+ if (IsConstantSrc) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
@@ -12635,33 +12651,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
}
// Check if we found a legal integer type that creates a meaningful merge.
- if (LastLegalType < 2 && LastLegalVectorType < 2)
- break;
+ if (LastLegalType < 2 && LastLegalVectorType < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
true, UseVector);
- if (!Merged)
- break;
+ if (!Merged) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
+ }
// Remove merged stores for next iteration.
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
RV = true;
- NumConsecutiveStores -= NumElem;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
}
- return RV;
- }
- // When extracting multiple vector elements, try to store them
- // in one vector store rather than a sequence of scalar stores.
- if (IsExtractVecSrc) {
- bool RV = false;
- while (StoreNodes.size() >= 2) {
+ // When extracting multiple vector elements, try to store them
+ // in one vector store rather than a sequence of scalar stores.
+ if (IsExtractVecSrc) {
LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
unsigned FirstStoreAS = FirstInChain->getAddressSpace();
unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned NumStoresToMerge = 0;
+ unsigned NumStoresToMerge = 1;
bool IsVec = MemVT.isVector();
for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
@@ -12673,7 +12689,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
// handles consecutive loads).
if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
- return false;
+ return RV;
// Find a legal type for the vector store.
unsigned Elts = i + 1;
@@ -12693,187 +12709,205 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
bool Merged = MergeStoresOfConstantsOrVecElts(
StoreNodes, MemVT, NumStoresToMerge, false, true);
- if (!Merged)
- break;
+ if (!Merged) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ continue;
+ }
// Remove merged stores for next iteration.
StoreNodes.erase(StoreNodes.begin(),
StoreNodes.begin() + NumStoresToMerge);
RV = true;
- NumConsecutiveStores -= NumStoresToMerge;
+ continue;
}
- return RV;
- }
- // Below we handle the case of multiple consecutive stores that
- // come from multiple consecutive loads. We merge them into a single
- // wide load and a single wide store.
+ // Below we handle the case of multiple consecutive stores that
+ // come from multiple consecutive loads. We merge them into a single
+ // wide load and a single wide store.
- // Look for load nodes which are used by the stored values.
- SmallVector<MemOpLink, 8> LoadNodes;
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
- // Find acceptable loads. Loads need to have the same chain (token factor),
- // must not be zext, volatile, indexed, and they must be consecutive.
- BaseIndexOffset LdBasePtr;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
- if (!Ld) break;
-
- // Loads must only have one use.
- if (!Ld->hasNUsesOfValue(1, 0))
- break;
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be zext, volatile, indexed, and they must be consecutive.
+ BaseIndexOffset LdBasePtr;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+ if (!Ld)
+ break;
- // The memory operands must not be volatile.
- if (Ld->isVolatile() || Ld->isIndexed())
- break;
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ break;
- // We do not accept ext loads.
- if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
- break;
+ // The memory operands must not be volatile.
+ if (Ld->isVolatile() || Ld->isIndexed())
+ break;
- // The stored memory type must be the same.
- if (Ld->getMemoryVT() != MemVT)
- break;
+ // We do not accept ext loads.
+ if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
- // If this is not the first ptr that we check.
- if (LdBasePtr.Base.getNode()) {
- // The base ptr must be the same.
- if (!LdPtr.equalBaseIndex(LdBasePtr))
+ // The stored memory type must be the same.
+ if (Ld->getMemoryVT() != MemVT)
break;
- } else {
- // Check that all other base pointers are the same as this one.
- LdBasePtr = LdPtr;
+
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ // If this is not the first ptr that we check.
+ if (LdBasePtr.Base.getNode()) {
+ // The base ptr must be the same.
+ if (!LdPtr.equalBaseIndex(LdBasePtr))
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr;
+ }
+
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));
}
- // We found a potential memory operand to merge.
- LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));
- }
+ if (LoadNodes.size() < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
- if (LoadNodes.size() < 2)
- return false;
+ // If we have load/store pair instructions and we only have two values,
+ // don't bother.
+ unsigned RequiredAlignment;
+ if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ St->getAlignment() >= RequiredAlignment) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
+ continue;
+ }
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ unsigned FirstLoadAS = FirstLoad->getAddressSpace();
+ unsigned FirstLoadAlign = FirstLoad->getAlignment();
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive load memory address. These variables hold the index in
+ // the store node array.
+ unsigned LastConsecutiveLoad = 0;
+ // This variable refers to the size and not index in the array.
+ unsigned LastLegalVectorType = 0;
+ unsigned LastLegalIntegerType = 0;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue FirstChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != FirstChain)
+ break;
- // If we have load/store pair instructions and we only have two values,
- // don't bother.
- unsigned RequiredAlignment;
- if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
- St->getAlignment() >= RequiredAlignment)
- return false;
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
- unsigned FirstLoadAS = FirstLoad->getAddressSpace();
- unsigned FirstLoadAlign = FirstLoad->getAlignment();
-
- // Scan the memory operations on the chain and find the first non-consecutive
- // load memory address. These variables hold the index in the store node
- // array.
- unsigned LastConsecutiveLoad = 0;
- // This variable refers to the size and not index in the array.
- unsigned LastLegalVectorType = 0;
- unsigned LastLegalIntegerType = 0;
- StartAddress = LoadNodes[0].OffsetFromBase;
- SDValue FirstChain = FirstLoad->getChain();
- for (unsigned i = 1; i < LoadNodes.size(); ++i) {
- // All loads must share the same chain.
- if (LoadNodes[i].MemNode->getChain() != FirstChain)
- break;
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
+ // Find a legal type for the vector store.
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1);
+ bool IsFastSt, IsFastLd;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
- int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- LastConsecutiveLoad = i;
- // Find a legal type for the vector store.
- EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
- bool IsFastSt, IsFastLd;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) && IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) && IsFastLd) {
- LastLegalVectorType = i + 1;
- }
-
- // Find a legal type for the integer store.
- unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
- StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) && IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) && IsFastLd)
- LastLegalIntegerType = i + 1;
- // Or check whether a truncstore and extload is legal.
- else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy =
- TLI.getTypeToTransformTo(Context, StoreTy);
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstLoadAS, FirstLoadAlign, &IsFastLd) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
IsFastLd)
- LastLegalIntegerType = i+1;
+ LastLegalIntegerType = i + 1;
+ // Or check whether a truncstore and extload is legal.
+ else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstLoadAS, FirstLoadAlign, &IsFastLd) &&
+ IsFastLd)
+ LastLegalIntegerType = i + 1;
+ }
}
- }
- // Only use vector types if the vector type is larger than the integer type.
- // If they are the same, use integers.
- bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
- unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
+ // Only use vector types if the vector type is larger than the integer type.
+ // If they are the same, use integers.
+ bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
+ unsigned LastLegalType =
+ std::max(LastLegalVectorType, LastLegalIntegerType);
- // We add +1 here because the LastXXX variables refer to location while
- // the NumElem refers to array/index size.
- unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
- NumElem = std::min(LastLegalType, NumElem);
+ // We add +1 here because the LastXXX variables refer to location while
+ // the NumElem refers to array/index size.
+ unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
+ NumElem = std::min(LastLegalType, NumElem);
- if (NumElem < 2)
- return false;
+ if (NumElem < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
- // Find if it is better to use vectors or integers to load and store
- // to memory.
- EVT JointMemOpVT;
- if (UseVectorTy) {
- JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
- } else {
- unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
- JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
- }
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
- SDLoc LoadDL(LoadNodes[0].MemNode);
- SDLoc StoreDL(StoreNodes[0].MemNode);
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
- // The merged loads are required to have the same incoming chain, so
- // using the first's chain is acceptable.
- SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
- FirstLoad->getBasePtr(),
- FirstLoad->getPointerInfo(), FirstLoadAlign);
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+ SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign);
- SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
- AddToWorklist(NewStoreChain.getNode());
+ AddToWorklist(NewStoreChain.getNode());
- SDValue NewStore =
- DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstStoreAlign);
+ SDValue NewStore = DAG.getStore(
+ NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign);
- // Transfer chain users from old loads to the new load.
- for (unsigned i = 0; i < NumElem; ++i) {
- LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
- SDValue(NewLoad.getNode(), 1));
- }
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
- // Replace the all stores with the new store.
- for (unsigned i = 0; i < NumElem; ++i)
- CombineTo(StoreNodes[i].MemNode, NewStore);
- return true;
+      // Replace all of the stores with the new store.
+ for (unsigned i = 0; i < NumElem; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ RV = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
+ }
+ return RV;
}
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
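
The MergeConsecutiveStores rewrite above turns a set of early `return false` exits into one outer loop: each iteration finds a consecutive run of candidate stores, tries to merge it, erases whatever was examined, and retries on the remaining candidates, accumulating the overall result in RV. Stripped of the legality and type-selection details, the control flow is roughly the following sketch (Candidate, findConsecutiveRun and tryMergeRun are simplified stand-ins, not DAGCombiner code):

// Control-flow sketch of the new merge loop; all names here are illustrative.
#include <cstdint>
#include <vector>

struct Candidate {
  int64_t OffsetFromBase;
};

// Length of the leading run of consecutive candidates (>= 1 for a non-empty
// list).
static unsigned findConsecutiveRun(const std::vector<Candidate> &Nodes,
                                   int64_t ElementSizeBytes) {
  unsigned Run = 1;
  for (unsigned i = 1, e = Nodes.size(); i < e; ++i) {
    if (Nodes[i].OffsetFromBase - Nodes[0].OffsetFromBase !=
        ElementSizeBytes * static_cast<int64_t>(i))
      break;
    Run = i + 1;
  }
  return Run;
}

// Placeholder for the legality checks plus the constant/vector-element and
// load-store merging done by the real function.
static bool tryMergeRun(std::vector<Candidate> &, unsigned) { return true; }

static bool mergeAllCandidates(std::vector<Candidate> &StoreNodes,
                               int64_t ElementSizeBytes) {
  bool RV = false;
  while (StoreNodes.size() > 1) {
    unsigned Run = findConsecutiveRun(StoreNodes, ElementSizeBytes);
    if (Run < 2) {
      // No mergeable prefix: drop one candidate and rescan the remainder
      // instead of giving up on the whole list.
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
      continue;
    }
    if (tryMergeRun(StoreNodes, Run))
      RV = true;
    // Whether or not the merge succeeded, remove the examined run and retry.
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + Run);
  }
  return RV;
}
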
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 057badcd6b74..16c1f78f1b35 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4685,9 +4685,10 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
/// used when a memcpy is turned into a memset when the source is a constant
/// string ptr.
static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
- const TargetLowering &TLI, StringRef Str) {
+ const TargetLowering &TLI,
+ const ConstantDataArraySlice &Slice) {
// Handle vector with all elements zero.
- if (Str.empty()) {
+ if (Slice.Array == nullptr) {
if (VT.isInteger())
return DAG.getConstant(0, dl, VT);
else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
@@ -4706,15 +4707,15 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
assert(!VT.isVector() && "Can't handle vector type here!");
unsigned NumVTBits = VT.getSizeInBits();
unsigned NumVTBytes = NumVTBits / 8;
- unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
+ unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));
APInt Val(NumVTBits, 0);
if (DAG.getDataLayout().isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
- Val |= (uint64_t)(unsigned char)Str[i] << i*8;
+ Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
} else {
for (unsigned i = 0; i != NumBytes; ++i)
- Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
+ Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
}
// If the "cost" of materializing the integer immediate is less than the cost
@@ -4731,9 +4732,8 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT));
}
-/// isMemSrcFromString - Returns true if memcpy source is a string constant.
-///
-static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
+/// Returns true if memcpy source is constant data.
+static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
uint64_t SrcDelta = 0;
GlobalAddressSDNode *G = nullptr;
if (Src.getOpcode() == ISD::GlobalAddress)
@@ -4747,8 +4747,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
if (!G)
return false;
- return getConstantStringInfo(G->getGlobal(), Str,
- SrcDelta + G->getOffset(), false);
+ return getConstantDataArrayInfo(G->getGlobal(), Slice, 8,
+ SrcDelta + G->getOffset());
}
/// Determines the optimal series of memory ops to replace the memset / memcpy.
@@ -4891,15 +4891,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
if (Align > SrcAlign)
SrcAlign = Align;
- StringRef Str;
- bool CopyFromStr = isMemSrcFromString(Src, Str);
- bool isZeroStr = CopyFromStr && Str.empty();
+ ConstantDataArraySlice Slice;
+ bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
+ bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
- (isZeroStr ? 0 : SrcAlign),
- false, false, CopyFromStr, true,
+ (isZeroConstant ? 0 : SrcAlign),
+ false, false, CopyFromConstant, true,
DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(),
DAG, TLI))
@@ -4943,18 +4943,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
DstOff -= VTSize - Size;
}
- if (CopyFromStr &&
- (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
+ if (CopyFromConstant &&
+ (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
// instruction. It would require a load from a constantpool first.
// We only handle zero vectors here.
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
- Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
+ ConstantDataArraySlice SubSlice;
+ if (SrcOff < Slice.Length) {
+ SubSlice = Slice;
+ SubSlice.move(SrcOff);
+ } else {
+ // This is an out-of-bounds access and hence UB. Pretend we read zero.
+ SubSlice.Array = nullptr;
+ SubSlice.Offset = 0;
+ SubSlice.Length = VTSize;
+ }
+ Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+ DstPtrInfo.getWithOffset(DstOff), Align,
+ MMOFlags);
}
if (!Store.getNode()) {
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index c0a5041b1395..1c66649cae01 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -110,8 +110,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType,
Builder.FuncInfo.StatepointStackSlots.size() &&
"Broken invariant");
- StatepointMaxSlotsRequired = std::max<unsigned long>(
- StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size());
+ StatepointMaxSlotsRequired.updateMax(
+ Builder.FuncInfo.StatepointStackSlots.size());
return SpillSlot;
}
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index a8aafe78748d..5da77264261b 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -58,12 +58,13 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
cl::init(true), cl::Hidden);
char StackProtector::ID = 0;
-INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors",
- false, true)
+INITIALIZE_PASS_BEGIN(StackProtector, "stack-protector",
+ "Insert stack protectors", false, true)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(StackProtector, "stack-protector",
+ "Insert stack protectors", false, true)
-FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
- return new StackProtector(TM);
-}
+FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); }
StackProtector::SSPLayoutKind
StackProtector::getSSPLayout(const AllocaInst *AI) const {
@@ -97,6 +98,8 @@ bool StackProtector::runOnFunction(Function &Fn) {
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ Trip = TM->getTargetTriple();
TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
HasPrologue = false;
HasIRCheck = false;
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 9724cb074584..83348058eca9 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -315,7 +315,9 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
TargetPassConfig::TargetPassConfig()
: ImmutablePass(ID), PM(nullptr) {
- llvm_unreachable("TargetPassConfig should not be constructed on-the-fly");
+ report_fatal_error("Trying to construct TargetPassConfig without a target "
+ "machine. Scheduling a CodeGen pass without a target "
+ "triple set?");
}
// Helper to verify the analysis is really immutable.
@@ -514,14 +516,14 @@ void TargetPassConfig::addPassesToHandleExceptions() {
LLVM_FALLTHROUGH;
case ExceptionHandling::DwarfCFI:
case ExceptionHandling::ARM:
- addPass(createDwarfEHPass(TM));
+ addPass(createDwarfEHPass());
break;
case ExceptionHandling::WinEH:
// We support using both GCC-style and MSVC-style exceptions on Windows, so
// add both preparation passes. Each pass will only actually run if it
// recognizes the personality function.
- addPass(createWinEHPass(TM));
- addPass(createDwarfEHPass(TM));
+ addPass(createWinEHPass());
+ addPass(createDwarfEHPass());
break;
case ExceptionHandling::None:
addPass(createLowerInvokePass());
@@ -536,7 +538,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
/// before exception handling preparation passes.
void TargetPassConfig::addCodeGenPrepare() {
if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
- addPass(createCodeGenPreparePass(TM));
+ addPass(createCodeGenPreparePass());
addPass(createRewriteSymbolsPass());
}
@@ -551,8 +553,8 @@ void TargetPassConfig::addISelPrepare() {
// Add both the safe stack and the stack protection passes: each of them will
// only protect functions that have corresponding attributes.
- addPass(createSafeStackPass(TM));
- addPass(createStackProtectorPass(TM));
+ addPass(createSafeStackPass());
+ addPass(createStackProtectorPass());
if (PrintISelInput)
addPass(createPrintFunctionPass(
@@ -623,9 +625,6 @@ void TargetPassConfig::addMachinePasses() {
addPass(&LocalStackSlotAllocationID, false);
}
- if (getOptLevel() != CodeGenOpt::None)
- addPass(&LiveRangeShrinkID);
-
// Run pre-ra passes.
addPreRegAlloc();
@@ -650,7 +649,7 @@ void TargetPassConfig::addMachinePasses() {
// Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
// do so if it hasn't been disabled, substituted, or overridden.
if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
- addPass(createPrologEpilogInserterPass(TM));
+ addPass(createPrologEpilogInserterPass());
/// Add passes that optimize machine instructions after register allocation.
if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index ae07e8b2fa03..a632b40c20f5 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -54,7 +54,7 @@ namespace {
class WinEHPrepare : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {}
+ WinEHPrepare() : FunctionPass(ID) {}
bool runOnFunction(Function &Fn) override;
@@ -94,12 +94,10 @@ private:
} // end anonymous namespace
char WinEHPrepare::ID = 0;
-INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
- false, false)
+INITIALIZE_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
+ false, false)
-FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) {
- return new WinEHPrepare(TM);
-}
+FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); }
bool WinEHPrepare::runOnFunction(Function &Fn) {
if (!Fn.hasPersonalityFn())
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 8d9353ae5f5e..556ebf78622f 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -2,10 +2,10 @@ add_llvm_library(LLVMDebugInfoCodeView
CodeViewError.cpp
CodeViewRecordIO.cpp
CVSymbolVisitor.cpp
- CVTypeDumper.cpp
CVTypeVisitor.cpp
EnumTables.cpp
Formatters.cpp
+ LazyRandomTypeCollection.cpp
Line.cpp
ModuleDebugFileChecksumFragment.cpp
ModuleDebugFragment.cpp
@@ -13,7 +13,6 @@ add_llvm_library(LLVMDebugInfoCodeView
ModuleDebugFragmentVisitor.cpp
ModuleDebugInlineeLinesFragment.cpp
ModuleDebugLineFragment.cpp
- RandomAccessTypeVisitor.cpp
RecordSerialization.cpp
StringTable.cpp
SymbolRecordMapping.cpp
@@ -22,10 +21,12 @@ add_llvm_library(LLVMDebugInfoCodeView
TypeDatabase.cpp
TypeDatabaseVisitor.cpp
TypeDumpVisitor.cpp
+ TypeIndex.cpp
TypeRecordMapping.cpp
TypeSerializer.cpp
TypeStreamMerger.cpp
-
+ TypeTableCollection.cpp
+
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/CodeView
)
diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
deleted file mode 100644
index 02e1682f76e7..000000000000
--- a/lib/DebugInfo/CodeView/CVTypeDumper.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//===-- CVTypeDumper.cpp - CodeView type info dumper ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/Support/BinaryByteStream.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) {
- TypeDatabaseVisitor DBV(TypeDB);
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(DBV);
- Pipeline.addCallbackToPipeline(Dumper);
-
- CVType RecordCopy = Record;
- return codeview::visitTypeRecord(RecordCopy, Pipeline, VDS_BytesPresent,
- Handler);
-}
-
-Error CVTypeDumper::dump(const CVTypeArray &Types,
- TypeVisitorCallbacks &Dumper) {
- TypeDatabaseVisitor DBV(TypeDB);
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(DBV);
- Pipeline.addCallbackToPipeline(Dumper);
-
- return codeview::visitTypeStream(Types, Pipeline, Handler);
-}
-
-Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) {
- BinaryByteStream Stream(Data, llvm::support::little);
- CVTypeArray Types;
- BinaryStreamReader Reader(Stream);
- if (auto EC = Reader.readArray(Types, Reader.getLength()))
- return EC;
-
- return dump(Types, Dumper);
-}
-
-void CVTypeDumper::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName,
- TypeIndex TI, TypeDatabase &DB) {
- StringRef TypeName;
- if (!TI.isNoneType())
- TypeName = DB.getTypeName(TI);
- if (!TypeName.empty())
- Printer.printHex(FieldName, TypeName, TI.getIndex());
- else
- Printer.printHex(FieldName, TI.getIndex());
-}
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 0f7f5f667790..f95c3e79388e 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -9,7 +9,9 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
@@ -22,8 +24,6 @@
using namespace llvm;
using namespace llvm::codeview;
-CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
- : Callbacks(Callbacks) {}
template <typename T>
static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
@@ -66,6 +66,67 @@ static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) {
return R;
}
+static Error visitMemberRecord(CVMemberRecord &Record,
+ TypeVisitorCallbacks &Callbacks) {
+ if (auto EC = Callbacks.visitMemberBegin(Record))
+ return EC;
+
+ switch (Record.Kind) {
+ default:
+ if (auto EC = Callbacks.visitUnknownMember(Record))
+ return EC;
+ break;
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ case EnumName: { \
+ if (auto EC = visitKnownMember<Name##Record>(Record, Callbacks)) \
+ return EC; \
+ break; \
+ }
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
+ MEMBER_RECORD(EnumVal, EnumVal, AliasName)
+#define TYPE_RECORD(EnumName, EnumVal, Name)
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+ }
+
+ if (auto EC = Callbacks.visitMemberEnd(Record))
+ return EC;
+
+ return Error::success();
+}
+
+namespace {
+
+class CVTypeVisitor {
+public:
+ explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks);
+
+ void addTypeServerHandler(TypeServerHandler &Handler);
+
+ Error visitTypeRecord(CVType &Record, TypeIndex Index);
+ Error visitTypeRecord(CVType &Record);
+
+ /// Visits the type records in Data. Sets the error flag on parse failures.
+ Error visitTypeStream(const CVTypeArray &Types);
+ Error visitTypeStream(CVTypeRange Types);
+ Error visitTypeStream(TypeCollection &Types);
+
+ Error visitMemberRecord(CVMemberRecord Record);
+ Error visitFieldListMemberStream(BinaryStreamReader &Stream);
+
+private:
+ Expected<bool> handleTypeServer(CVType &Record);
+ Error finishVisitation(CVType &Record);
+
+ /// The interface to the class that gets notified of each visitation.
+ TypeVisitorCallbacks &Callbacks;
+
+ TinyPtrVector<TypeServerHandler *> Handlers;
+};
+
+CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
+ : Callbacks(Callbacks) {}
+
void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
Handlers.push_back(&Handler);
}
@@ -144,35 +205,6 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
return finishVisitation(Record);
}
-static Error visitMemberRecord(CVMemberRecord &Record,
- TypeVisitorCallbacks &Callbacks) {
- if (auto EC = Callbacks.visitMemberBegin(Record))
- return EC;
-
- switch (Record.Kind) {
- default:
- if (auto EC = Callbacks.visitUnknownMember(Record))
- return EC;
- break;
-#define MEMBER_RECORD(EnumName, EnumVal, Name) \
- case EnumName: { \
- if (auto EC = visitKnownMember<Name##Record>(Record, Callbacks)) \
- return EC; \
- break; \
- }
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
- MEMBER_RECORD(EnumVal, EnumVal, AliasName)
-#define TYPE_RECORD(EnumName, EnumVal, Name)
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
- }
-
- if (auto EC = Callbacks.visitMemberEnd(Record))
- return EC;
-
- return Error::success();
-}
-
Error CVTypeVisitor::visitMemberRecord(CVMemberRecord Record) {
return ::visitMemberRecord(Record, Callbacks);
}
@@ -194,12 +226,18 @@ Error CVTypeVisitor::visitTypeStream(CVTypeRange Types) {
return Error::success();
}
-Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader Reader) {
- FieldListDeserializer Deserializer(Reader);
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(Callbacks);
+Error CVTypeVisitor::visitTypeStream(TypeCollection &Types) {
+ Optional<TypeIndex> I = Types.getFirst();
+ while (I) {
+ CVType Type = Types.getType(*I);
+ if (auto EC = visitTypeRecord(Type, *I))
+ return EC;
+ I = Types.getNext(*I);
+ }
+ return Error::success();
+}
+Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader &Reader) {
TypeLeafKind Leaf;
while (!Reader.empty()) {
if (auto EC = Reader.readEnum(Leaf))
@@ -207,20 +245,13 @@ Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader Reader) {
CVMemberRecord Record;
Record.Kind = Leaf;
- if (auto EC = ::visitMemberRecord(Record, Pipeline))
+ if (auto EC = ::visitMemberRecord(Record, Callbacks))
return EC;
}
return Error::success();
}
-Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) {
- BinaryByteStream S(Data, llvm::support::little);
- BinaryStreamReader SR(S);
- return visitFieldListMemberStream(SR);
-}
-
-namespace {
struct FieldListVisitHelper {
FieldListVisitHelper(TypeVisitorCallbacks &Callbacks, ArrayRef<uint8_t> Data,
VisitorDataSource Source)
@@ -241,11 +272,8 @@ struct FieldListVisitHelper {
};
struct VisitHelper {
- VisitHelper(TypeVisitorCallbacks &Callbacks, VisitorDataSource Source,
- TypeServerHandler *TS)
+ VisitHelper(TypeVisitorCallbacks &Callbacks, VisitorDataSource Source)
: Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) {
- if (TS)
- Visitor.addTypeServerHandler(*TS);
if (Source == VDS_BytesPresent) {
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Callbacks);
@@ -262,29 +290,57 @@ Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index,
TypeVisitorCallbacks &Callbacks,
VisitorDataSource Source,
TypeServerHandler *TS) {
- VisitHelper Helper(Callbacks, Source, TS);
- return Helper.Visitor.visitTypeRecord(Record, Index);
+ VisitHelper V(Callbacks, Source);
+ if (TS)
+ V.Visitor.addTypeServerHandler(*TS);
+ return V.Visitor.visitTypeRecord(Record, Index);
}
Error llvm::codeview::visitTypeRecord(CVType &Record,
TypeVisitorCallbacks &Callbacks,
VisitorDataSource Source,
TypeServerHandler *TS) {
- VisitHelper Helper(Callbacks, Source, TS);
- return Helper.Visitor.visitTypeRecord(Record);
+ VisitHelper V(Callbacks, Source);
+ if (TS)
+ V.Visitor.addTypeServerHandler(*TS);
+ return V.Visitor.visitTypeRecord(Record);
}
-Error llvm::codeview::visitMemberRecordStream(ArrayRef<uint8_t> FieldList,
- TypeVisitorCallbacks &Callbacks) {
- CVTypeVisitor Visitor(Callbacks);
- return Visitor.visitFieldListMemberStream(FieldList);
+Error llvm::codeview::visitTypeStream(const CVTypeArray &Types,
+ TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS) {
+ VisitHelper V(Callbacks, VDS_BytesPresent);
+ if (TS)
+ V.Visitor.addTypeServerHandler(*TS);
+ return V.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitTypeStream(CVTypeRange Types,
+ TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS) {
+ VisitHelper V(Callbacks, VDS_BytesPresent);
+ if (TS)
+ V.Visitor.addTypeServerHandler(*TS);
+ return V.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitTypeStream(TypeCollection &Types,
+ TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS) {
+ // When the internal visitor calls Types.getType(Index) the interface is
+ // required to return a CVType with the bytes filled out. So we can assume
+ // that the bytes will be present when individual records are visited.
+ VisitHelper V(Callbacks, VDS_BytesPresent);
+ if (TS)
+ V.Visitor.addTypeServerHandler(*TS);
+ return V.Visitor.visitTypeStream(Types);
}
Error llvm::codeview::visitMemberRecord(CVMemberRecord Record,
TypeVisitorCallbacks &Callbacks,
VisitorDataSource Source) {
- FieldListVisitHelper Helper(Callbacks, Record.Data, Source);
- return Helper.Visitor.visitMemberRecord(Record);
+ FieldListVisitHelper V(Callbacks, Record.Data, Source);
+ return V.Visitor.visitMemberRecord(Record);
}
Error llvm::codeview::visitMemberRecord(TypeLeafKind Kind,
@@ -296,16 +352,8 @@ Error llvm::codeview::visitMemberRecord(TypeLeafKind Kind,
return visitMemberRecord(R, Callbacks, VDS_BytesPresent);
}
-Error llvm::codeview::visitTypeStream(const CVTypeArray &Types,
- TypeVisitorCallbacks &Callbacks,
- TypeServerHandler *TS) {
- VisitHelper Helper(Callbacks, VDS_BytesPresent, TS);
- return Helper.Visitor.visitTypeStream(Types);
-}
-
-Error llvm::codeview::visitTypeStream(CVTypeRange Types,
- TypeVisitorCallbacks &Callbacks,
- TypeServerHandler *TS) {
- VisitHelper Helper(Callbacks, VDS_BytesPresent, TS);
- return Helper.Visitor.visitTypeStream(Types);
+Error llvm::codeview::visitMemberRecordStream(ArrayRef<uint8_t> FieldList,
+ TypeVisitorCallbacks &Callbacks) {
+ FieldListVisitHelper V(Callbacks, FieldList, VDS_BytesPresent);
+ return V.Visitor.visitFieldListMemberStream(V.Reader);
}
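
The new visitTypeStream(TypeCollection &) overload above drives visitation through the collection's Optional-returning getFirst()/getNext() interface instead of iterating a raw record array. A minimal caller using the same traversal (a sketch; walkAllTypes is not an LLVM function) could look like:

// Sketch of walking a TypeCollection the way the new overload does.
#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/Support/Error.h"

static llvm::Error walkAllTypes(llvm::codeview::TypeCollection &Types,
                                llvm::codeview::TypeVisitorCallbacks &CB) {
  using namespace llvm::codeview;
  llvm::Optional<TypeIndex> I = Types.getFirst();
  while (I) {
    // The TypeCollection interface returns records with their bytes present,
    // so the default byte-based visitation is safe here.
    CVType Type = Types.getType(*I);
    if (auto EC = visitTypeRecord(Type, *I, CB))
      return EC;
    I = Types.getNext(*I); // None once the collection is exhausted
  }
  return llvm::Error::success();
}
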
diff --git a/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
new file mode 100644
index 000000000000..39eb4099ce9e
--- /dev/null
+++ b/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -0,0 +1,229 @@
+//===- LazyRandomTypeCollection.cpp -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static void error(Error &&EC) {
+ assert(!static_cast<bool>(EC));
+ if (EC)
+ consumeError(std::move(EC));
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(CVTypeArray(), RecordCountHint,
+ PartialOffsetArray()) {}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(
+ const CVTypeArray &Types, uint32_t RecordCountHint,
+ PartialOffsetArray PartialOffsets)
+ : Database(RecordCountHint), Types(Types), DatabaseVisitor(Database),
+ PartialOffsets(PartialOffsets) {
+ KnownOffsets.resize(Database.capacity());
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(ArrayRef<uint8_t> Data,
+ uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(RecordCountHint) {
+ reset(Data);
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(StringRef Data,
+ uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(
+ makeArrayRef(Data.bytes_begin(), Data.bytes_end()), RecordCountHint) {
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(const CVTypeArray &Types,
+ uint32_t NumRecords)
+ : LazyRandomTypeCollection(Types, NumRecords, PartialOffsetArray()) {}
+
+void LazyRandomTypeCollection::reset(StringRef Data) {
+ reset(makeArrayRef(Data.bytes_begin(), Data.bytes_end()));
+}
+
+void LazyRandomTypeCollection::reset(ArrayRef<uint8_t> Data) {
+ PartialOffsets = PartialOffsetArray();
+
+ BinaryStreamReader Reader(Data, support::little);
+ error(Reader.readArray(Types, Reader.getLength()));
+
+ KnownOffsets.resize(Database.capacity());
+}
+
+CVType LazyRandomTypeCollection::getType(TypeIndex Index) {
+ error(ensureTypeExists(Index));
+ return Database.getTypeRecord(Index);
+}
+
+StringRef LazyRandomTypeCollection::getTypeName(TypeIndex Index) {
+ if (!Index.isSimple()) {
+ // Try to make sure the type exists. Even if it doesn't though, it may be
+ // because we're dumping a symbol stream with no corresponding type stream
+ // present, in which case we still want to be able to print <unknown UDT>
+ // for the type names.
+ consumeError(ensureTypeExists(Index));
+ }
+
+ return Database.getTypeName(Index);
+}
+
+bool LazyRandomTypeCollection::contains(TypeIndex Index) {
+ return Database.contains(Index);
+}
+
+uint32_t LazyRandomTypeCollection::size() { return Database.size(); }
+
+uint32_t LazyRandomTypeCollection::capacity() { return Database.capacity(); }
+
+Error LazyRandomTypeCollection::ensureTypeExists(TypeIndex TI) {
+ if (!Database.contains(TI)) {
+ if (auto EC = visitRangeForType(TI))
+ return EC;
+ }
+ return Error::success();
+}
+
+Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) {
+ if (PartialOffsets.empty())
+ return fullScanForType(TI);
+
+ auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
+ [](TypeIndex Value, const TypeIndexOffset &IO) {
+ return Value < IO.Type;
+ });
+
+ assert(Next != PartialOffsets.begin());
+ auto Prev = std::prev(Next);
+
+ TypeIndex TIB = Prev->Type;
+ if (Database.contains(TIB)) {
+ // They've asked us to fetch a type index, but the entry we found in the
+ // partial offsets array has already been visited. Since we visit an entire
+ // block every time, that means this record should have been previously
+    // discovered. Ultimately, this is a request for a non-existent type
+    // index.
+ return make_error<CodeViewError>("Invalid type index");
+ }
+
+ TypeIndex TIE;
+ if (Next == PartialOffsets.end()) {
+ TIE = TypeIndex::fromArrayIndex(Database.capacity());
+ } else {
+ TIE = Next->Type;
+ }
+
+ if (auto EC = visitRange(TIB, Prev->Offset, TIE))
+ return EC;
+ return Error::success();
+}
+
+Optional<TypeIndex> LazyRandomTypeCollection::getFirst() {
+ TypeIndex TI = TypeIndex::fromArrayIndex(0);
+ if (auto EC = ensureTypeExists(TI)) {
+ consumeError(std::move(EC));
+ return None;
+ }
+ return TI;
+}
+
+Optional<TypeIndex> LazyRandomTypeCollection::getNext(TypeIndex Prev) {
+  // We can't be sure how long this type stream is, since the record count
+  // given to the constructor is only a hint. So just try to make sure the next
+ // record exists, and if anything goes wrong, we must be at the end.
+ if (auto EC = ensureTypeExists(Prev + 1)) {
+ consumeError(std::move(EC));
+ return None;
+ }
+
+ return Prev + 1;
+}
+
+Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) {
+ assert(PartialOffsets.empty());
+
+ TypeIndex CurrentTI = TypeIndex::fromArrayIndex(0);
+ uint32_t Offset = 0;
+ auto Begin = Types.begin();
+
+ if (!Database.empty()) {
+    // For type streams whose record count we don't know up front, a lookup can
+    // trigger a full scan, more records can then be appended (we only learn the
+    // true count after a full visitation), and a later lookup of one of the new
+    // types would trigger yet another full scan. To avoid rescanning from the
+    // start, if the database already has records we ask it for the largest type
+    // index less than the one we're searching for and only do the forward scan
+    // from there.
+ auto Prev = Database.largestTypeIndexLessThan(TI);
+ assert(Prev.hasValue() && "Empty database with valid types?");
+ Offset = KnownOffsets[Prev->toArrayIndex()];
+ CurrentTI = *Prev;
+ ++CurrentTI;
+ Begin = Types.at(Offset);
+ ++Begin;
+ Offset = Begin.offset();
+ }
+
+ auto End = Types.end();
+ while (Begin != End) {
+ if (auto EC = visitOneRecord(CurrentTI, Offset, *Begin))
+ return EC;
+
+ Offset += Begin.getRecordLength();
+ ++Begin;
+ ++CurrentTI;
+ }
+ if (CurrentTI <= TI) {
+ return make_error<CodeViewError>("Type Index does not exist!");
+ }
+ return Error::success();
+}
+
+Error LazyRandomTypeCollection::visitRange(TypeIndex Begin,
+ uint32_t BeginOffset,
+ TypeIndex End) {
+
+ auto RI = Types.at(BeginOffset);
+ assert(RI != Types.end());
+
+ while (Begin != End) {
+ if (auto EC = visitOneRecord(Begin, BeginOffset, *RI))
+ return EC;
+
+ BeginOffset += RI.getRecordLength();
+ ++Begin;
+ ++RI;
+ }
+
+ return Error::success();
+}
+
+Error LazyRandomTypeCollection::visitOneRecord(TypeIndex TI, uint32_t Offset,
+ CVType &Record) {
+ assert(!Database.contains(TI));
+ if (auto EC = codeview::visitTypeRecord(Record, TI, DatabaseVisitor))
+ return EC;
+ // Keep the KnownOffsets array the same size as the Database's capacity. Since
+ // we don't always know how many records are in the type stream, we need to be
+  // prepared for the database growing and receiving a type index that can't fit
+ // in our current buffer.
+ if (KnownOffsets.size() < Database.capacity())
+ KnownOffsets.resize(Database.capacity());
+ KnownOffsets[TI.toArrayIndex()] = Offset;
+ return Error::success();
+}
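
For context on how the new collection is meant to be consumed (a usage sketch, not part of the patch): a dumper can construct it directly over the raw type-record bytes with nothing but a record-count hint, and names are materialized lazily as indices are touched. dumpFirstTypeName and the hint value below are illustrative.

// Usage sketch: TypeBuffer is assumed to hold raw CodeView type record bytes.
#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/Support/raw_ostream.h"

static void dumpFirstTypeName(llvm::ArrayRef<uint8_t> TypeBuffer,
                              llvm::raw_ostream &OS) {
  using namespace llvm::codeview;
  // 100 is only a hint; the collection grows as more records are discovered.
  LazyRandomTypeCollection Types(TypeBuffer, /*RecordCountHint=*/100);
  if (llvm::Optional<TypeIndex> First = Types.getFirst())
    OS << Types.getTypeName(*First) << "\n";
}
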
diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
deleted file mode 100644
index 704d1131108a..000000000000
--- a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//===- RandomAccessTypeVisitor.cpp ---------------------------- *- C++ --*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
-
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-RandomAccessTypeVisitor::RandomAccessTypeVisitor(
- const CVTypeArray &Types, uint32_t NumRecords,
- PartialOffsetArray PartialOffsets)
- : Database(NumRecords), Types(Types), DatabaseVisitor(Database),
- PartialOffsets(PartialOffsets) {
-
- KnownOffsets.resize(Database.capacity());
-}
-
-Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI,
- TypeVisitorCallbacks &Callbacks) {
- assert(TI.toArrayIndex() < Database.capacity());
-
- if (!Database.contains(TI)) {
- if (auto EC = visitRangeForType(TI))
- return EC;
- }
-
- assert(Database.contains(TI));
- auto &Record = Database.getTypeRecord(TI);
- return codeview::visitTypeRecord(Record, TI, Callbacks);
-}
-
-Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) {
- if (PartialOffsets.empty()) {
- TypeIndex TIB(TypeIndex::FirstNonSimpleIndex);
- TypeIndex TIE = TIB + Database.capacity();
- return visitRange(TIB, 0, TIE);
- }
-
- auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
- [](TypeIndex Value, const TypeIndexOffset &IO) {
- return Value < IO.Type;
- });
-
- assert(Next != PartialOffsets.begin());
- auto Prev = std::prev(Next);
-
- TypeIndex TIB = Prev->Type;
- TypeIndex TIE;
- if (Next == PartialOffsets.end()) {
- TIE = TypeIndex::fromArrayIndex(Database.capacity());
- } else {
- TIE = Next->Type;
- }
-
- if (auto EC = visitRange(TIB, Prev->Offset, TIE))
- return EC;
- return Error::success();
-}
-
-Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset,
- TypeIndex End) {
-
- auto RI = Types.at(BeginOffset);
- assert(RI != Types.end());
-
- while (Begin != End) {
- assert(!Database.contains(Begin));
- if (auto EC = codeview::visitTypeRecord(*RI, Begin, DatabaseVisitor))
- return EC;
- KnownOffsets[Begin.toArrayIndex()] = BeginOffset;
-
- BeginOffset += RI.getRecordLength();
- ++Begin;
- ++RI;
- }
-
- return Error::success();
-}
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index 5395e4349b28..7d01c8c5f194 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -11,7 +11,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/EnumTables.h"
#include "llvm/DebugInfo/CodeView/StringTable.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
@@ -33,9 +32,9 @@ namespace {
/// the visitor out of SymbolDumper.h.
class CVSymbolDumperImpl : public SymbolVisitorCallbacks {
public:
- CVSymbolDumperImpl(TypeDatabase &TypeDB, SymbolDumpDelegate *ObjDelegate,
+ CVSymbolDumperImpl(TypeCollection &Types, SymbolDumpDelegate *ObjDelegate,
ScopedPrinter &W, bool PrintRecordBytes)
- : TypeDB(TypeDB), ObjDelegate(ObjDelegate), W(W),
+ : Types(Types), ObjDelegate(ObjDelegate), W(W),
PrintRecordBytes(PrintRecordBytes), InFunctionScope(false) {}
/// CVSymbolVisitor overrides.
@@ -54,7 +53,7 @@ private:
void printLocalVariableAddrGap(ArrayRef<LocalVariableAddrGap> Gaps);
void printTypeIndex(StringRef FieldName, TypeIndex TI);
- TypeDatabase &TypeDB;
+ TypeCollection &Types;
SymbolDumpDelegate *ObjDelegate;
ScopedPrinter &W;
@@ -83,7 +82,7 @@ void CVSymbolDumperImpl::printLocalVariableAddrGap(
}
void CVSymbolDumperImpl::printTypeIndex(StringRef FieldName, TypeIndex TI) {
- CVTypeDumper::printTypeIndex(W, FieldName, TI, TypeDB);
+ codeview::printTypeIndex(W, FieldName, TI, Types);
}
Error CVSymbolDumperImpl::visitSymbolBegin(CVSymbol &CVR) {
@@ -670,7 +669,7 @@ Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) {
Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
SymbolVisitorCallbackPipeline Pipeline;
SymbolDeserializer Deserializer(ObjDelegate.get());
- CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes);
+ CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Dumper);
@@ -681,7 +680,7 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) {
SymbolVisitorCallbackPipeline Pipeline;
SymbolDeserializer Deserializer(ObjDelegate.get());
- CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes);
+ CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Dumper);
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index 7924440e5e29..af05d2dc294b 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -72,16 +72,20 @@ TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) {
}
TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) {
- TypeIndex TI;
- TI = getAppendIndex();
- if (TI.toArrayIndex() >= capacity())
+ LargestTypeIndex = getAppendIndex();
+ if (LargestTypeIndex.toArrayIndex() >= capacity())
grow();
- recordType(Name, TI, Data);
- return TI;
+ recordType(Name, LargestTypeIndex, Data);
+ return LargestTypeIndex;
}
void TypeDatabase::recordType(StringRef Name, TypeIndex Index,
const CVType &Data) {
+ LargestTypeIndex = empty() ? Index : std::max(Index, LargestTypeIndex);
+
+ if (LargestTypeIndex.toArrayIndex() >= capacity())
+ grow(Index);
+
uint32_t AI = Index.toArrayIndex();
assert(!contains(Index));
@@ -144,19 +148,66 @@ uint32_t TypeDatabase::size() const { return Count; }
uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); }
-void TypeDatabase::grow() {
- TypeRecords.emplace_back();
- CVUDTNames.emplace_back();
- ValidRecords.resize(ValidRecords.size() + 1);
+CVType TypeDatabase::getType(TypeIndex Index) { return getTypeRecord(Index); }
+
+StringRef TypeDatabase::getTypeName(TypeIndex Index) {
+ return static_cast<const TypeDatabase *>(this)->getTypeName(Index);
+}
+
+bool TypeDatabase::contains(TypeIndex Index) {
+ return static_cast<const TypeDatabase *>(this)->contains(Index);
+}
+
+uint32_t TypeDatabase::size() {
+ return static_cast<const TypeDatabase *>(this)->size();
+}
+
+uint32_t TypeDatabase::capacity() {
+ return static_cast<const TypeDatabase *>(this)->capacity();
+}
+
+void TypeDatabase::grow() { grow(LargestTypeIndex + 1); }
+
+void TypeDatabase::grow(TypeIndex NewIndex) {
+ uint32_t NewSize = NewIndex.toArrayIndex() + 1;
+
+ if (NewSize <= capacity())
+ return;
+
+ uint32_t NewCapacity = NewSize * 3 / 2;
+
+ TypeRecords.resize(NewCapacity);
+ CVUDTNames.resize(NewCapacity);
+ ValidRecords.resize(NewCapacity);
}
bool TypeDatabase::empty() const { return size() == 0; }
+Optional<TypeIndex> TypeDatabase::largestTypeIndexLessThan(TypeIndex TI) const {
+ uint32_t AI = TI.toArrayIndex();
+ int N = ValidRecords.find_prev(AI);
+ if (N == -1)
+ return None;
+ return TypeIndex::fromArrayIndex(N);
+}
+
TypeIndex TypeDatabase::getAppendIndex() const {
if (empty())
return TypeIndex::fromArrayIndex(0);
- int Index = ValidRecords.find_last();
- assert(Index != -1);
- return TypeIndex::fromArrayIndex(Index) + 1;
+ return LargestTypeIndex + 1;
+}
+
+Optional<TypeIndex> TypeDatabase::getFirst() {
+ int N = ValidRecords.find_first();
+ if (N == -1)
+ return None;
+ return TypeIndex::fromArrayIndex(N);
+}
+
+Optional<TypeIndex> TypeDatabase::getNext(TypeIndex Prev) {
+ int N = ValidRecords.find_next(Prev.toArrayIndex());
+ if (N == -1)
+ return None;
+ return TypeIndex::fromArrayIndex(N);
}
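Note: the non-const getFirst()/getNext() overloads make a sparse TypeDatabase walkable in index order while skipping unrecorded slots. A minimal traversal sketch, assuming a populated database named TypeDB (only APIs visible in this patch are used):

    for (Optional<TypeIndex> TI = TypeDB.getFirst(); TI; TI = TypeDB.getNext(*TI)) {
      // Both accessors are valid here because ValidRecords has a set bit at *TI.
      CVType Record = TypeDB.getType(*TI);
      StringRef Name = TypeDB.getTypeName(*TI);
      (void)Record; (void)Name;
    }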
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 9485c9cfedff..84f52a055815 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -10,15 +10,13 @@
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -165,16 +163,15 @@ static StringRef getLeafTypeName(TypeLeafKind LT) {
}
void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const {
- CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB);
+ codeview::printTypeIndex(*W, FieldName, TI, TpiTypes);
}
void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
- CVTypeDumper::printTypeIndex(*W, FieldName, TI, getSourceDB());
+ codeview::printTypeIndex(*W, FieldName, TI, getSourceTypes());
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
- TypeIndex TI = getSourceDB().getAppendIndex();
- return visitTypeBegin(Record, TI);
+ return visitTypeBegin(Record, TypeIndex::fromArrayIndex(TpiTypes.size()));
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
@@ -245,7 +242,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringListRecord &Strs) {
W->printNumber("NumStrings", Size);
ListScope Arguments(*W, "Strings");
for (uint32_t I = 0; I < Size; ++I) {
- printTypeIndex("String", Indices[I]);
+ printItemIndex("String", Indices[I]);
}
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/TypeIndex.cpp b/lib/DebugInfo/CodeView/TypeIndex.cpp
new file mode 100644
index 000000000000..20ba6470cd5b
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeIndex.cpp
@@ -0,0 +1,27 @@
+//===-- TypeIndex.cpp - CodeView type index ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+void llvm::codeview::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName,
+ TypeIndex TI, TypeCollection &Types) {
+ StringRef TypeName;
+ if (!TI.isNoneType())
+ TypeName = Types.getTypeName(TI);
+ if (!TypeName.empty())
+ Printer.printHex(FieldName, TypeName, TI.getIndex());
+ else
+ Printer.printHex(FieldName, TI.getIndex());
+}
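Note: this free function takes any TypeCollection, so database-backed and stream-backed dumpers can share one implementation. A hedged call-site sketch, where TI and Types are assumed to be an existing TypeIndex and TypeCollection in the caller:

    ScopedPrinter Printer(outs());
    // Prints "Type: <name> (0x1004)" when Types can resolve TI, otherwise just the hex index.
    codeview::printTypeIndex(Printer, "Type", TI, Types);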
diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp
index fd4d1853fa54..3b061e67e05e 100644
--- a/lib/DebugInfo/CodeView/TypeSerializer.cpp
+++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -66,6 +66,31 @@ TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
return Result.first->getValue();
}
+TypeIndex
+TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
+ MutableArrayRef<uint8_t> Data) {
+ assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
+
+ StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());
+
+ // Do a two-stage lookup / insert so that we don't have to allocate unless
+ // we're going to do an insert. This is a big memory savings.
+ auto Iter = HashedRecords.find(S);
+ if (Iter != HashedRecords.end())
+ return Iter->second;
+
+ LastTypeIndex = calcNextTypeIndex();
+ uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
+ ::memcpy(Copy, Data.data(), Data.size());
+ Data = MutableArrayRef<uint8_t>(Copy, Data.size());
+ S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
+ HashedRecords.insert(std::make_pair(S, LastTypeIndex));
+ SeenRecords.push_back(Data);
+ Record.RecordData = Data;
+ return LastTypeIndex;
+}
+
Expected<MutableArrayRef<uint8_t>>
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
uint32_t Align = Record.size() % 4;
@@ -137,11 +162,9 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
reinterpret_cast<RecordPrefix *>(ThisRecordData.data());
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
- uint8_t *Copy = RecordStorage.Allocate<uint8_t>(ThisRecordData.size());
- ::memcpy(Copy, ThisRecordData.data(), ThisRecordData.size());
- ThisRecordData = MutableArrayRef<uint8_t>(Copy, ThisRecordData.size());
- Record = CVType(*TypeKind, ThisRecordData);
- TypeIndex InsertedTypeIndex = insertRecordBytesPrivate(ThisRecordData);
+ Record.Type = *TypeKind;
+ TypeIndex InsertedTypeIndex =
+ insertRecordBytesWithCopy(Record, ThisRecordData);
// Write out each additional segment in reverse order, and update each
// record's continuation index to point to the previous one.
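Note: the deduplication above hashes the serialized bytes before copying them into RecordStorage, so duplicate records never touch the allocator. A standalone sketch of the same two-stage pattern, with hypothetical names and a DenseMap standing in for the serializer's hash table:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Allocator.h"
    #include <cstring>

    static llvm::DenseMap<llvm::StringRef, unsigned> Dedup; // keys point at stable copies
    static llvm::BumpPtrAllocator Storage;

    unsigned internRecord(llvm::StringRef Bytes, unsigned NextId) {
      auto It = Dedup.find(Bytes);          // stage 1: probe with caller-owned bytes
      if (It != Dedup.end())
        return It->second;                  // hit: no allocation at all
      char *Copy = Storage.Allocate<char>(Bytes.size());
      std::memcpy(Copy, Bytes.data(), Bytes.size());
      Dedup.insert({llvm::StringRef(Copy, Bytes.size()), NextId}); // stage 2: copy + insert
      return NextId;
    }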
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index 51f24fa3f135..46747f8eab99 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -11,11 +11,9 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -60,9 +58,12 @@ namespace {
class TypeStreamMerger : public TypeVisitorCallbacks {
public:
TypeStreamMerger(TypeTableBuilder &DestIdStream,
- TypeTableBuilder &DestTypeStream, TypeServerHandler *Handler)
+ TypeTableBuilder &DestTypeStream,
+ SmallVectorImpl<TypeIndex> &SourceToDest,
+ TypeServerHandler *Handler)
: DestIdStream(DestIdStream), DestTypeStream(DestTypeStream),
- FieldListBuilder(DestTypeStream), Handler(Handler) {}
+ FieldListBuilder(DestTypeStream), Handler(Handler),
+ IndexMap(SourceToDest) {}
static const TypeIndex Untranslated;
@@ -143,7 +144,7 @@ private:
/// Map from source type index to destination type index. Indexed by source
/// type index minus 0x1000.
- SmallVector<TypeIndex, 0> IndexMap;
+ SmallVectorImpl<TypeIndex> &IndexMap;
};
} // end anonymous namespace
@@ -477,8 +478,9 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
Error llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestIdStream,
TypeTableBuilder &DestTypeStream,
+ SmallVectorImpl<TypeIndex> &SourceToDest,
TypeServerHandler *Handler,
const CVTypeArray &Types) {
- return TypeStreamMerger(DestIdStream, DestTypeStream, Handler)
+ return TypeStreamMerger(DestIdStream, DestTypeStream, SourceToDest, Handler)
.mergeStream(Types);
}
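Note: callers now supply a vector that receives the source-to-destination index mapping instead of the merger keeping it private. A hedged call-site sketch, assuming IdBuilder, TypeBuilder and Types already exist in the caller:

    SmallVector<TypeIndex, 128> SourceToDest;
    if (auto EC = codeview::mergeTypeStreams(IdBuilder, TypeBuilder, SourceToDest,
                                             /*Handler=*/nullptr, Types))
      return EC;
    // SourceToDest[I] is the destination index for source index I + 0x1000,
    // or Untranslated if that record could not be remapped.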
diff --git a/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/lib/DebugInfo/CodeView/TypeTableCollection.cpp
new file mode 100644
index 000000000000..a18710d6ab52
--- /dev/null
+++ b/lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -0,0 +1,83 @@
+//===- TypeTableCollection.cpp ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static void error(Error &&EC) {
+ assert(!static_cast<bool>(EC));
+ if (EC)
+ consumeError(std::move(EC));
+}
+
+TypeTableCollection::TypeTableCollection(
+ ArrayRef<MutableArrayRef<uint8_t>> Records)
+ : Records(Records), Database(Records.size()) {}
+
+Optional<TypeIndex> TypeTableCollection::getFirst() {
+ if (empty())
+ return None;
+ return TypeIndex::fromArrayIndex(0);
+}
+
+Optional<TypeIndex> TypeTableCollection::getNext(TypeIndex Prev) {
+ ++Prev;
+ assert(Prev.toArrayIndex() <= size());
+ if (Prev.toArrayIndex() == size())
+ return None;
+ return Prev;
+}
+
+void TypeTableCollection::ensureTypeExists(TypeIndex Index) {
+ assert(hasCapacityFor(Index));
+
+ if (Database.contains(Index))
+ return;
+
+ BinaryByteStream Bytes(Records[Index.toArrayIndex()], support::little);
+
+ CVType Type;
+ uint32_t Len;
+ error(VarStreamArrayExtractor<CVType>::extract(Bytes, Len, Type));
+
+ TypeDatabaseVisitor DBV(Database);
+ error(codeview::visitTypeRecord(Type, Index, DBV));
+ assert(Database.contains(Index));
+}
+
+CVType TypeTableCollection::getType(TypeIndex Index) {
+ ensureTypeExists(Index);
+ return Database.getTypeRecord(Index);
+}
+
+StringRef TypeTableCollection::getTypeName(TypeIndex Index) {
+ if (!Index.isSimple())
+ ensureTypeExists(Index);
+ return Database.getTypeName(Index);
+}
+
+bool TypeTableCollection::contains(TypeIndex Index) {
+ return Database.contains(Index);
+}
+
+uint32_t TypeTableCollection::size() { return Records.size(); }
+
+uint32_t TypeTableCollection::capacity() { return Records.size(); }
+
+bool TypeTableCollection::hasCapacityFor(TypeIndex Index) const {
+ return Index.toArrayIndex() < Records.size();
+}
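Note: TypeTableCollection materializes names lazily through its internal TypeDatabase, so it can wrap freshly serialized records without walking the whole stream up front. A minimal sketch, assuming Builder exposes its serialized records as an ArrayRef<MutableArrayRef<uint8_t>> (e.g. via a records() accessor, which is an assumption here):

    TypeTableCollection Collection(Builder.records());
    for (Optional<TypeIndex> TI = Collection.getFirst(); TI;
         TI = Collection.getNext(*TI))
      outs() << Collection.getTypeName(*TI) << "\n";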
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 61e75a2b56ab..8e7c6c43d1a2 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -979,7 +979,7 @@ Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec,
return Decompressor.takeError();
SmallString<32> Out;
- if (auto Err = Decompressor->decompress(Out))
+ if (auto Err = Decompressor->resizeAndDecompress(Out))
return Err;
UncompressedSections.emplace_back(std::move(Out));
@@ -1063,18 +1063,20 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
// TODO: Add support for relocations in other sections as needed.
// Record relocations for the debug_info and debug_line sections.
- RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
- .Case("debug_info", &InfoSection.Relocs)
- .Case("debug_loc", &LocSection.Relocs)
- .Case("debug_info.dwo", &InfoDWOSection.Relocs)
- .Case("debug_line", &LineSection.Relocs)
- .Case("debug_ranges", &RangeSection.Relocs)
- .Case("apple_names", &AppleNamesSection.Relocs)
- .Case("apple_types", &AppleTypesSection.Relocs)
- .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
- .Case("apple_namespac", &AppleNamespacesSection.Relocs)
- .Case("apple_objc", &AppleObjCSection.Relocs)
- .Default(nullptr);
+ RelocAddrMap *Map =
+ StringSwitch<RelocAddrMap *>(RelSecName)
+ .Case("debug_info", &InfoSection.Relocs)
+ .Case("debug_loc", &LocSection.Relocs)
+ .Case("debug_info.dwo", &InfoDWOSection.Relocs)
+ .Case("debug_line", &LineSection.Relocs)
+ .Case("debug_ranges", &RangeSection.Relocs)
+ .Case("debug_addr", &AddrSection.Relocs)
+ .Case("apple_names", &AppleNamesSection.Relocs)
+ .Case("apple_types", &AppleTypesSection.Relocs)
+ .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
+ .Case("apple_namespac", &AppleNamespacesSection.Relocs)
+ .Case("apple_objc", &AppleObjCSection.Relocs)
+ .Default(nullptr);
if (!Map) {
// Find debug_types relocs by section rather than name as there are
// multiple, comdat grouped, debug_types sections.
@@ -1104,14 +1106,14 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
}
object::RelocVisitor V(Obj);
- object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr));
+ uint64_t Val = V.visit(Reloc.getType(), Reloc, *SymAddrOrErr);
if (V.error()) {
SmallString<32> Name;
Reloc.getTypeName(Name);
errs() << "error: failed to compute relocation: " << Name << "\n";
continue;
}
- Map->insert({Reloc.getOffset(), {R.Value}});
+ Map->insert({Reloc.getOffset(), {Val}});
}
}
}
@@ -1148,7 +1150,7 @@ StringRef *DWARFContextInMemory::MapSectionToMember(StringRef Name) {
.Case("debug_line.dwo", &LineDWOSection.Data)
.Case("debug_str.dwo", &StringDWOSection)
.Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
- .Case("debug_addr", &AddrSection)
+ .Case("debug_addr", &AddrSection.Data)
.Case("apple_names", &AppleNamesSection.Data)
.Case("apple_types", &AppleTypesSection.Data)
.Case("apple_namespaces", &AppleNamespacesSection.Data)
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 3835d4da9ae9..c268afc222c3 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -33,7 +33,7 @@ using namespace dwarf;
void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
parseImpl(C, Section, C.getDebugAbbrev(), &C.getRangeSection(),
- C.getStringSection(), StringRef(), C.getAddrSection(),
+ C.getStringSection(), StringRef(), &C.getAddrSection(),
C.getLineSection().Data, C.isLittleEndian(), false);
}
@@ -42,14 +42,14 @@ void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
DWARFUnitIndex *Index) {
parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &C.getRangeDWOSection(),
C.getStringDWOSection(), C.getStringOffsetDWOSection(),
- C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(),
+ &C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(),
true);
}
DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
const DWARFDebugAbbrev *DA, const DWARFSection *RS,
- StringRef SS, StringRef SOS, StringRef AOS, StringRef LS,
- bool LE, bool IsDWO,
+ StringRef SS, StringRef SOS, const DWARFSection *AOS,
+ StringRef LS, bool LE, bool IsDWO,
const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *IndexEntry)
: Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
@@ -69,10 +69,10 @@ DWARFUnit::~DWARFUnit() = default;
bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
uint64_t &Result) const {
uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
- if (AddrOffsetSection.size() < Offset + AddrSize)
+ if (AddrOffsetSection->Data.size() < Offset + AddrSize)
return false;
- DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
- Result = DA.getAddress(&Offset);
+ DataExtractor DA(AddrOffsetSection->Data, isLittleEndian, AddrSize);
+ Result = getRelocatedValue(DA, AddrSize, &Offset, &AddrOffsetSection->Relocs);
return true;
}
@@ -249,7 +249,7 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
return DieArray.size();
}
-DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath) {
+DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath, uint64_t DWOId) {
auto Obj = object::ObjectFile::createObjectFile(DWOPath);
if (!Obj) {
// TODO: Actually report errors helpfully.
@@ -259,8 +259,11 @@ DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath) {
DWOFile = std::move(Obj.get());
DWOContext.reset(
cast<DWARFContext>(new DWARFContextInMemory(*DWOFile.getBinary())));
- if (DWOContext->getNumDWOCompileUnits() > 0)
- DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
+ for (const auto &DWOCU : DWOContext->dwo_compile_units())
+ if (DWOCU->getDWOId() == DWOId) {
+ DWOU = DWOCU.get();
+ return;
+ }
}
bool DWARFUnit::parseDWO() {
@@ -281,10 +284,12 @@ bool DWARFUnit::parseDWO() {
sys::path::append(AbsolutePath, *CompilationDir);
}
sys::path::append(AbsolutePath, *DWOFileName);
- DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
+ auto DWOId = getDWOId();
+ if (!DWOId)
+ return false;
+ DWO = llvm::make_unique<DWOHolder>(AbsolutePath, *DWOId);
DWARFUnit *DWOCU = DWO->getUnit();
- // Verify that compile unit in .dwo file is valid.
- if (!DWOCU || DWOCU->getDWOId() != getDWOId()) {
+ if (!DWOCU) {
DWO.reset();
return false;
}
diff --git a/lib/DebugInfo/PDB/Native/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp
index f7538c580ba4..2f4fb6cc295d 100644
--- a/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -72,14 +72,6 @@ Error DbiStream::reload() {
return make_error<RawError>(raw_error_code::feature_unsupported,
"Unsupported DBI version.");
- auto IS = Pdb.getPDBInfoStream();
- if (!IS)
- return IS.takeError();
-
- if (Header->Age != IS->getAge())
- return make_error<RawError>(raw_error_code::corrupt_file,
- "DBI Age does not match PDB Age.");
-
if (Stream->getLength() !=
sizeof(DbiStreamHeader) + Header->ModiSubstreamSize +
Header->SecContrSubstreamSize + Header->SectionMapSize +
diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index 4dd965c69071..c6568029ec55 100644
--- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -117,6 +117,7 @@ Expected<uint32_t> PDBFileBuilder::getNamedStreamIndex(StringRef Name) const {
}
Error PDBFileBuilder::commit(StringRef Filename) {
+ assert(!Filename.empty());
auto ExpectedLayout = finalizeMsfLayout();
if (!ExpectedLayout)
return ExpectedLayout.takeError();
diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
index cb783cf4fea7..f00567db743e 100644
--- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
@@ -21,6 +21,7 @@
#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/PDB/GenericError.h"
#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index c0999d93dbb9..8e0065873892 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -9,10 +9,7 @@
#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
@@ -91,9 +88,6 @@ Error TpiStream::reload() {
HSR.setOffset(Header->HashValueBuffer.Off);
if (auto EC = HSR.readArray(HashValues, NumHashValues))
return EC;
- std::vector<ulittle32_t> HashValueList;
- for (auto I : HashValues)
- HashValueList.push_back(I);
HSR.setOffset(Header->IndexOffsetBuffer.Off);
uint32_t NumTypeIndexOffsets =
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 701a318511b8..20456cc97823 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -69,7 +69,7 @@ Error TpiStreamBuilder::finalize() {
uint32_t Count = TypeRecords.size();
- H->Version = *VerHeader;
+ H->Version = VerHeader;
H->HeaderSize = sizeof(TpiStreamHeader);
H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex;
H->TypeIndexEnd = H->TypeIndexBegin + Count;
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 49dbe74d25df..f454ae61d965 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -1947,7 +1947,7 @@ static const char *parse_type(const char *first, const char *last, C &db) {
break;
}
}
- // drop through
+ // falls through
default:
// must check for builtin-types before class-enum-types to avoid
// ambiguities with operator-names
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index ce60367a6c8b..adb31d127a2e 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -1058,7 +1058,7 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment. For now, say
// we can't change a known alignment.
- unsigned OldAlign = getParamAlignment(Index);
+ unsigned OldAlign = getAttributes(Index).getAlignment();
unsigned NewAlign = B.getAlignment();
assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
"Attempt to change alignment!");
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 8bcba7672315..06934b365a11 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -521,6 +521,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
}
+ break;
}
case 'o':
// We only need to change the name to match the mangling including the
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 4b9d89cda539..8b0ff66334a7 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -37,10 +37,6 @@ using namespace llvm;
// Constant Class
//===----------------------------------------------------------------------===//
-void Constant::anchor() { }
-
-void ConstantData::anchor() {}
-
bool Constant::isNegativeZeroValue() const {
// Floating point values have an explicit -0.0 value.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
@@ -496,8 +492,6 @@ void Constant::removeDeadConstantUsers() const {
// ConstantInt
//===----------------------------------------------------------------------===//
-void ConstantInt::anchor() { }
-
ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V)
: ConstantData(Ty, ConstantIntVal), Val(V) {
assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
@@ -610,8 +604,6 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
return &APFloat::PPCDoubleDouble();
}
-void ConstantFP::anchor() { }
-
Constant *ConstantFP::get(Type *Ty, double V) {
LLVMContext &Context = Ty->getContext();
@@ -2266,9 +2258,6 @@ Type *GetElementPtrConstantExpr::getResultElementType() const {
//===----------------------------------------------------------------------===//
// ConstantData* implementations
-void ConstantDataArray::anchor() {}
-void ConstantDataVector::anchor() {}
-
Type *ConstantDataSequential::getElementType() const {
return getType()->getElementType();
}
@@ -2627,8 +2616,8 @@ Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
}
-bool ConstantDataSequential::isString() const {
- return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(8);
+bool ConstantDataSequential::isString(unsigned CharSize) const {
+ return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(CharSize);
}
bool ConstantDataSequential::isCString() const {
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index 25eb9452d9d0..6c189cf656de 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -44,8 +44,6 @@ namespace llvm {
/// UnaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement unary constant exprs.
class UnaryConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
: ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
@@ -65,8 +63,6 @@ public:
/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags)
@@ -90,8 +86,6 @@ public:
/// SelectConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement select constant exprs.
class SelectConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
@@ -115,8 +109,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractelement constant exprs.
class ExtractElementConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ExtractElementConstantExpr(Constant *C1, Constant *C2)
: ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
@@ -140,8 +132,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertelement constant exprs.
class InsertElementConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C1->getType(), Instruction::InsertElement,
@@ -166,8 +156,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(VectorType::get(
@@ -195,8 +183,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractvalue constant exprs.
class ExtractValueConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
Type *DestTy)
@@ -230,8 +216,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertvalue constant exprs.
class InsertValueConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
InsertValueConstantExpr(Constant *Agg, Constant *Val,
ArrayRef<unsigned> IdxList, Type *DestTy)
@@ -271,8 +255,6 @@ class GetElementPtrConstantExpr : public ConstantExpr {
GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList, Type *DestTy);
- void anchor() override;
-
public:
static GetElementPtrConstantExpr *Create(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList,
@@ -301,8 +283,6 @@ public:
// behind the scenes to implement ICmp and FCmp constant expressions. This is
// needed in order to store the predicate value for these instructions.
class CompareConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
unsigned short predicate;
CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index c117d29b7f69..d5e29649a237 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -307,7 +307,7 @@ void DataLayout::parseSpecifier(StringRef Desc) {
case 'a': {
AlignTypeEnum AlignType;
switch (Specifier) {
- default:
+ default: llvm_unreachable("Unexpected specifier!");
case 'i': AlignType = INTEGER_ALIGN; break;
case 'v': AlignType = VECTOR_ALIGN; break;
case 'f': AlignType = FLOAT_ALIGN; break;
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 16a9e51b8306..39de4b0a97fa 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -66,8 +66,6 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
// Argument Implementation
//===----------------------------------------------------------------------===//
-void Argument::anchor() {}
-
Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
: Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
setName(Name);
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index 3477c087967f..7572d0c6b3bc 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -381,8 +381,11 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
+ if (!PassThru)
+ PassThru = UndefValue::get(DataTy);
+
Type *OverloadedTypes[] = {DataTy, PtrsTy};
- Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
+ Value * Ops[] = {Ptrs, getInt32(Align), Mask, PassThru};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
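Note: with this change an explicit pass-through vector is honored instead of being silently replaced with undef, while a null PassThru still defaults to undef. A hedged usage sketch, with Builder, Ptrs, Mask and PassThru assumed to exist in the caller:

    // All-ones default mask and undef pass-through:
    Value *G0 = Builder.CreateMaskedGather(Ptrs, /*Align=*/4);
    // Explicit mask and pass-through, now threaded into the intrinsic call:
    Value *G1 = Builder.CreateMaskedGather(Ptrs, /*Align=*/4, Mask, PassThru, "g");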
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 8feeeb65d445..6c0c5a267f81 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -40,10 +40,6 @@ InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
"Function type not legal for constraints!");
}
-// Implement the first virtual method in this class in this file so the
-// InlineAsm vtable is emitted here.
-InlineAsm::~InlineAsm() = default;
-
InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
bool isAlignStack, AsmDialect asmDialect) {
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 91b9d9232b54..828e78b13005 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -43,8 +43,6 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
InsertAtEnd->getInstList().push_back(this);
}
-
-// Out of line virtual method, so the vtable, etc has a home.
Instruction::~Instruction() {
assert(!Parent && "Instruction still linked in the program!");
if (hasMetadataHashEntry())
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 5a5b9c0d06bb..01d4ed6c8eef 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -59,9 +59,6 @@ User::op_iterator CallSite::getCallee() const {
// TerminatorInst Class
//===----------------------------------------------------------------------===//
-// Out of line virtual method, so the vtable, etc has a home.
-TerminatorInst::~TerminatorInst() = default;
-
unsigned TerminatorInst::getNumSuccessors() const {
switch (getOpcode()) {
#define HANDLE_TERM_INST(N, OPC, CLASS) \
@@ -99,13 +96,6 @@ void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) {
}
//===----------------------------------------------------------------------===//
-// UnaryInstruction Class
-//===----------------------------------------------------------------------===//
-
-// Out of line virtual method, so the vtable, etc has a home.
-UnaryInstruction::~UnaryInstruction() = default;
-
-//===----------------------------------------------------------------------===//
// SelectInst Class
//===----------------------------------------------------------------------===//
@@ -138,8 +128,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
// PHINode Class
//===----------------------------------------------------------------------===//
-void PHINode::anchor() {}
-
PHINode::PHINode(const PHINode &PN)
: Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()),
ReservedSpace(PN.getNumOperands()) {
@@ -293,8 +281,6 @@ void LandingPadInst::addClause(Constant *Val) {
// CallInst Implementation
//===----------------------------------------------------------------------===//
-CallInst::~CallInst() = default;
-
void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
this->FTy = FTy;
@@ -900,8 +886,6 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
llvm_unreachable("ReturnInst has no successors!");
}
-ReturnInst::~ReturnInst() = default;
-
//===----------------------------------------------------------------------===//
// ResumeInst Implementation
//===----------------------------------------------------------------------===//
@@ -1337,9 +1321,6 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
setName(Name);
}
-// Out of line virtual method, so the vtable, etc has a home.
-AllocaInst::~AllocaInst() = default;
-
void AllocaInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
assert(Align <= MaximumAlignment &&
@@ -1689,8 +1670,6 @@ FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
// GetElementPtrInst Implementation
//===----------------------------------------------------------------------===//
-void GetElementPtrInst::anchor() {}
-
void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name) {
assert(getNumOperands() == 1 + IdxList.size() &&
@@ -2357,8 +2336,6 @@ float FPMathOperator::getFPAccuracy() const {
// CastInst Class
//===----------------------------------------------------------------------===//
-void CastInst::anchor() {}
-
// Just determine if this cast only deals with integral->integral conversion.
bool CastInst::isIntegerCast() const {
switch (getOpcode()) {
@@ -3387,8 +3364,6 @@ AddrSpaceCastInst::AddrSpaceCastInst(
// CmpInst Classes
//===----------------------------------------------------------------------===//
-void CmpInst::anchor() {}
-
CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
Value *RHS, const Twine &Name, Instruction *InsertBefore)
: Instruction(ty, op,
@@ -3528,8 +3503,6 @@ StringRef CmpInst::getPredicateName(Predicate Pred) {
}
}
-void ICmpInst::anchor() {}
-
ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown icmp predicate!");
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index 343722463e5f..4a30d28c3913 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -215,27 +215,6 @@ uint32_t LLVMContextImpl::getOperandBundleTagID(StringRef Tag) const {
return I->second;
}
-// ConstantsContext anchors
-void UnaryConstantExpr::anchor() { }
-
-void BinaryConstantExpr::anchor() { }
-
-void SelectConstantExpr::anchor() { }
-
-void ExtractElementConstantExpr::anchor() { }
-
-void InsertElementConstantExpr::anchor() { }
-
-void ShuffleVectorConstantExpr::anchor() { }
-
-void ExtractValueConstantExpr::anchor() { }
-
-void InsertValueConstantExpr::anchor() { }
-
-void GetElementPtrConstantExpr::anchor() { }
-
-void CompareConstantExpr::anchor() { }
-
/// Singleton instance of the OptBisect class.
///
/// This singleton is accessed via the LLVMContext::getOptBisect() function. It
diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp
index 584dee2869c1..c0f6f07169ff 100644
--- a/lib/IR/PassRegistry.cpp
+++ b/lib/IR/PassRegistry.cpp
@@ -105,8 +105,6 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
ImplementationInfo->getNormalCtor() &&
"Cannot specify pass as default if it does not have a default ctor");
InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor());
- InterfaceInfo->setTargetMachineCtor(
- ImplementationInfo->getTargetMachineCtor());
}
}
diff --git a/lib/IR/User.cpp b/lib/IR/User.cpp
index 497b4aa17643..d46039107f33 100644
--- a/lib/IR/User.cpp
+++ b/lib/IR/User.cpp
@@ -19,8 +19,6 @@ class BasicBlock;
// User Class
//===----------------------------------------------------------------------===//
-void User::anchor() {}
-
void User::replaceUsesOfWith(Value *From, Value *To) {
if (From == To) return; // Duh what?
@@ -193,12 +191,4 @@ void User::operator delete(void *Usr) {
}
}
-//===----------------------------------------------------------------------===//
-// Operator Class
-//===----------------------------------------------------------------------===//
-
-Operator::~Operator() {
- llvm_unreachable("should never destroy an Operator");
-}
-
} // End llvm namespace
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 02b40c93b5d8..51a7d424c1f3 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DerivedUser.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
@@ -59,7 +60,7 @@ Value::Value(Type *ty, unsigned scid)
(SubclassID < ConstantFirstVal || SubclassID > ConstantLastVal))
assert((VTy->isFirstClassType() || VTy->isVoidTy()) &&
"Cannot create non-first-class values except for constants!");
- static_assert(sizeof(Value) == 3 * sizeof(void *) + 2 * sizeof(unsigned),
+ static_assert(sizeof(Value) == 2 * sizeof(void *) + 2 * sizeof(unsigned),
"Value too big");
}
@@ -89,6 +90,32 @@ Value::~Value() {
destroyValueName();
}
+void Value::deleteValue() {
+ switch (getValueID()) {
+#define HANDLE_VALUE(Name) \
+ case Value::Name##Val: \
+ delete static_cast<Name *>(this); \
+ break;
+#define HANDLE_MEMORY_VALUE(Name) \
+ case Value::Name##Val: \
+ static_cast<DerivedUser *>(this)->DeleteValue( \
+ static_cast<DerivedUser *>(this)); \
+ break;
+#define HANDLE_INSTRUCTION(Name) /* nothing */
+#include "llvm/IR/Value.def"
+
+#define HANDLE_INST(N, OPC, CLASS) \
+ case Value::InstructionVal + Instruction::OPC: \
+ delete static_cast<CLASS *>(this); \
+ break;
+#define HANDLE_USER_INST(N, OPC, CLASS)
+#include "llvm/IR/Instruction.def"
+
+ default:
+ llvm_unreachable("attempting to delete unknown value kind");
+ }
+}
+
void Value::destroyValueName() {
ValueName *Name = getValueName();
if (Name)
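Note: the switch above replaces per-class virtual destructors; every concrete Value kind is deleted through its exact type, and DerivedUser kinds (values defined outside lib/IR) go through a stored callback. A self-contained sketch of that dispatch idea, using hypothetical types unrelated to the real Value hierarchy:

    struct Node { unsigned Kind; };                          // Kind plays the role of the ValueID
    struct CallbackNode : Node { void (*Deleter)(Node *); }; // extension kinds carry a deleter

    void destroyNode(Node *N) {
      switch (N->Kind) {
      case 0:                                                // a concrete, known kind
        delete N;
        break;
      case 1:                                                // an extension kind
        static_cast<CallbackNode *>(N)->Deleter(N);
        break;
      default:
        break;                                               // unknown kinds would be an error
      }
    }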
diff --git a/lib/IR/ValueTypes.cpp b/lib/IR/ValueTypes.cpp
index 2132e1659225..cf6ee063c2d5 100644
--- a/lib/IR/ValueTypes.cpp
+++ b/lib/IR/ValueTypes.cpp
@@ -142,6 +142,7 @@ std::string EVT::getEVTString() const {
case MVT::Other: return "ch";
case MVT::Glue: return "glue";
case MVT::x86mmx: return "x86mmx";
+ case MVT::v1i1: return "v1i1";
case MVT::v2i1: return "v2i1";
case MVT::v4i1: return "v4i1";
case MVT::v8i1: return "v8i1";
@@ -220,6 +221,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::f128: return Type::getFP128Ty(Context);
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::v1i1: return VectorType::get(Type::getInt1Ty(Context), 1);
case MVT::v2i1: return VectorType::get(Type::getInt1Ty(Context), 2);
case MVT::v4i1: return VectorType::get(Type::getInt1Ty(Context), 4);
case MVT::v8i1: return VectorType::get(Type::getInt1Ty(Context), 8);
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 3b68d6365872..21e8048442be 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -1317,6 +1317,12 @@ Verifier::visitModuleFlag(const MDNode *Op,
Assert(Inserted,
"module flag identifiers must be unique (or of 'require' type)", ID);
}
+
+ if (ID->getString() == "wchar_size") {
+    ConstantInt *Value =
+        mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+ Assert(Value, "wchar_size metadata requires constant integer argument");
+ }
}
/// Return true if this attribute kind only applies to functions.
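Note: the verifier only demands that the third operand of the wchar_size flag be a constant integer. A hedged sketch of emitting a flag that satisfies the check, assuming M is the Module being built:

    // Produces !{i32 1, !"wchar_size", i32 4} under the Error merge behavior.
    M.addModuleFlag(llvm::Module::Error, "wchar_size", 4);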
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 65a7994325bc..ca3fc60f9501 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -25,9 +25,11 @@
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/LTO/LTO.h"
#include "llvm/Linker/Linker.h"
@@ -62,6 +64,7 @@ namespace llvm {
extern cl::opt<bool> LTODiscardValueNames;
extern cl::opt<std::string> LTORemarksFilename;
extern cl::opt<bool> LTOPassRemarksWithHotness;
+extern cl::opt<bool> LTOStripInvalidDebugInfo;
}
namespace {
@@ -142,6 +145,30 @@ static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index) {
report_fatal_error("renameModuleForThinLTO failed");
}
+namespace {
+class ThinLTODiagnosticInfo : public DiagnosticInfo {
+ const Twine &Msg;
+public:
+ ThinLTODiagnosticInfo(const Twine &DiagMsg,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfo(DK_Linker, Severity), Msg(DiagMsg) {}
+ void print(DiagnosticPrinter &DP) const override { DP << Msg; }
+};
+}
+
+/// Verify the module and strip broken debug info.
+static void verifyLoadedModule(Module &TheModule) {
+ bool BrokenDebugInfo = false;
+ if (verifyModule(TheModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
+ report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ TheModule.getContext().diagnose(ThinLTODiagnosticInfo(
+ "Invalid debug info found, debug info will be stripped", DS_Warning));
+ StripDebugInfo(TheModule);
+ }
+}
+
static std::unique_ptr<Module>
loadModuleFromBuffer(const MemoryBufferRef &Buffer, LLVMContext &Context,
bool Lazy, bool IsImporting) {
@@ -159,6 +186,8 @@ loadModuleFromBuffer(const MemoryBufferRef &Buffer, LLVMContext &Context,
});
report_fatal_error("Can't load module, abort.");
}
+ if (!Lazy)
+ verifyLoadedModule(*ModuleOrErr.get());
return std::move(ModuleOrErr.get());
}
@@ -181,6 +210,8 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
});
report_fatal_error("importFunctions failed");
}
+ // Verify again after cross-importing.
+ verifyLoadedModule(TheModule);
}
static void optimizeModule(Module &TheModule, TargetMachine &TM,
@@ -195,7 +226,8 @@ static void optimizeModule(Module &TheModule, TargetMachine &TM,
PMB.OptLevel = OptLevel;
PMB.LoopVectorize = true;
PMB.SLPVectorize = true;
- PMB.VerifyInput = true;
+ // Already did this in verifyLoadedModule().
+ PMB.VerifyInput = false;
PMB.VerifyOutput = false;
legacy::PassManager PM;
@@ -505,29 +537,25 @@ static void initTMBuilder(TargetMachineBuilder &TMBuilder,
void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
ThinLTOBuffer Buffer(Data, Identifier);
- if (Modules.empty()) {
- // First module added, so initialize the triple and some options
- LLVMContext Context;
- StringRef TripleStr;
- ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
- Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
- if (TripleOrErr)
- TripleStr = *TripleOrErr;
- Triple TheTriple(TripleStr);
+ LLVMContext Context;
+ StringRef TripleStr;
+ ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+ Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
+
+ if (TripleOrErr)
+ TripleStr = *TripleOrErr;
+
+ Triple TheTriple(TripleStr);
+
+ if (Modules.empty())
initTMBuilder(TMBuilder, Triple(TheTriple));
+ else if (TMBuilder.TheTriple != TheTriple) {
+ if (!TMBuilder.TheTriple.isCompatibleWith(TheTriple))
+ report_fatal_error("ThinLTO modules with incompatible triples not "
+ "supported");
+ initTMBuilder(TMBuilder, Triple(TMBuilder.TheTriple.merge(TheTriple)));
}
-#ifndef NDEBUG
- else {
- LLVMContext Context;
- StringRef TripleStr;
- ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
- Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
- if (TripleOrErr)
- TripleStr = *TripleOrErr;
- assert(TMBuilder.TheTriple.str() == TripleStr &&
- "ThinLTO modules with different triple not supported");
- }
-#endif
+
Modules.push_back(Buffer);
}
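Note: addModule() now accepts modules whose triples differ as long as Triple::isCompatibleWith() holds, merging them via Triple::merge() rather than asserting on exact string equality. A hedged sketch of the check, using two Darwin triples that differ only in OS version (the case the old IRMover helpers special-cased):

    Triple A("x86_64-apple-macosx10.12.0");
    Triple B("x86_64-apple-macosx10.11.0");
    if (A.isCompatibleWith(B)) {
      // merge() returns the preferred triple string for the combined module.
      Triple Merged(A.merge(B));
      (void)Merged;
    }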
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index ecef1efda1a2..c0af21aa148c 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -1243,27 +1243,6 @@ Error IRLinker::linkModuleFlagsMetadata() {
return Error::success();
}
-// This function returns true if the triples match.
-static bool triplesMatch(const Triple &T0, const Triple &T1) {
- // If vendor is apple, ignore the version number.
- if (T0.getVendor() == Triple::Apple)
- return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() &&
- T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS();
-
- return T0 == T1;
-}
-
-// This function returns the merged triple.
-static std::string mergeTriples(const Triple &SrcTriple,
- const Triple &DstTriple) {
- // If vendor is apple, pick the triple with the larger version number.
- if (SrcTriple.getVendor() == Triple::Apple)
- if (DstTriple.isOSVersionLT(SrcTriple))
- return SrcTriple.str();
-
- return DstTriple.str();
-}
-
Error IRLinker::run() {
// Ensure metadata materialized before value mapping.
if (SrcM->getMaterializer())
@@ -1289,14 +1268,15 @@ Error IRLinker::run() {
Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());
- if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+ if (!SrcM->getTargetTriple().empty() &&
+ !SrcTriple.isCompatibleWith(DstTriple))
emitWarning("Linking two modules of different target triples: " +
SrcM->getModuleIdentifier() + "' is '" +
SrcM->getTargetTriple() + "' whereas '" +
DstM.getModuleIdentifier() + "' is '" + DstM.getTargetTriple() +
"'\n");
- DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
+ DstM.setTargetTriple(SrcTriple.merge(DstTriple));
// Append the module inline asm string.
if (!IsPerformingImport && !SrcM->getModuleInlineAsm().empty()) {
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index 2b44c4a82d2c..116af3c917be 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -17,6 +17,7 @@
#include "llvm/Object/Error.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/WindowsResource.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
@@ -71,9 +72,10 @@ Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
return ObjectFile::createSymbolicFile(Buffer, Type, Context);
case sys::fs::file_magic::macho_universal_binary:
return MachOUniversalBinary::create(Buffer);
+ case sys::fs::file_magic::windows_resource:
+ return WindowsResource::createWindowsResource(Buffer);
case sys::fs::file_magic::unknown:
case sys::fs::file_magic::coff_cl_gl_object:
- case sys::fs::file_magic::windows_resource:
// Unrecognized object file format.
return errorCodeToError(object_error::invalid_file_type);
}
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 08365e71c2f6..1d08a9efd8b3 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -2,6 +2,8 @@ add_llvm_library(LLVMObject
Archive.cpp
ArchiveWriter.cpp
Binary.cpp
+ COFFImportFile.cpp
+ COFFModuleDefinition.cpp
COFFObjectFile.cpp
Decompressor.cpp
ELF.cpp
@@ -18,6 +20,7 @@ add_llvm_library(LLVMObject
SymbolicFile.cpp
SymbolSize.cpp
WasmObjectFile.cpp
+ WindowsResource.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Object
diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp
new file mode 100644
index 000000000000..37962d84d855
--- /dev/null
+++ b/lib/Object/COFFImportFile.cpp
@@ -0,0 +1,527 @@
+//===- COFFImportFile.cpp - COFF short import file implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the writeImportLibrary function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace llvm::COFF;
+using namespace llvm::object;
+using namespace llvm;
+
+namespace llvm {
+namespace object {
+
+static bool is32bit(MachineTypes Machine) {
+ switch (Machine) {
+ default:
+ llvm_unreachable("unsupported machine");
+ case IMAGE_FILE_MACHINE_AMD64:
+ return false;
+ case IMAGE_FILE_MACHINE_ARMNT:
+ case IMAGE_FILE_MACHINE_I386:
+ return true;
+ }
+}
+
+static uint16_t getImgRelRelocation(MachineTypes Machine) {
+ switch (Machine) {
+ default:
+ llvm_unreachable("unsupported machine");
+ case IMAGE_FILE_MACHINE_AMD64:
+ return IMAGE_REL_AMD64_ADDR32NB;
+ case IMAGE_FILE_MACHINE_ARMNT:
+ return IMAGE_REL_ARM_ADDR32NB;
+ case IMAGE_FILE_MACHINE_I386:
+ return IMAGE_REL_I386_DIR32NB;
+ }
+}
+
+template <class T> static void append(std::vector<uint8_t> &B, const T &Data) {
+ size_t S = B.size();
+ B.resize(S + sizeof(T));
+ memcpy(&B[S], &Data, sizeof(T));
+}
+
+static void writeStringTable(std::vector<uint8_t> &B,
+ ArrayRef<const std::string> Strings) {
+ // The COFF string table consists of a 4-byte value which is the size of the
+ // table, including the length field itself. This value is followed by the
+ // string content itself, which is an array of null-terminated C-style
+ // strings. The termination is important as the strings are referenced by
+ // offset from the symbol entries in the file format.
+
+ size_t Pos = B.size();
+ size_t Offset = B.size();
+
+ // Skip over the length field; we will fill it in later, once the length
+ // has been computed while emitting the string content itself.
+ Pos += sizeof(uint32_t);
+
+ for (const auto &S : Strings) {
+ B.resize(Pos + S.length() + 1);
+ strcpy(reinterpret_cast<char *>(&B[Pos]), S.c_str());
+ Pos += S.length() + 1;
+ }
+
+ // Backfill the length of the table now that it has been computed.
+ support::ulittle32_t Length(B.size() - Offset);
+ support::endian::write32le(&B[Offset], Length);
+}
+
+static ImportNameType getNameType(StringRef Sym, StringRef ExtName,
+ MachineTypes Machine) {
+ if (Sym != ExtName)
+ return IMPORT_NAME_UNDECORATE;
+ if (Machine == IMAGE_FILE_MACHINE_I386 && Sym.startswith("_"))
+ return IMPORT_NAME_NOPREFIX;
+ return IMPORT_NAME;
+}
+
+static Expected<std::string> replace(StringRef S, StringRef From,
+ StringRef To) {
+ size_t Pos = S.find(From);
+
+ // From and To may be mangled, but substrings in S may not.
+ if (Pos == StringRef::npos && From.startswith("_") && To.startswith("_")) {
+ From = From.substr(1);
+ To = To.substr(1);
+ Pos = S.find(From);
+ }
+
+ if (Pos == StringRef::npos) {
+ return make_error<StringError>(
+ StringRef(Twine(S + ": replacing '" + From +
+ "' with '" + To + "' failed").str()), object_error::parse_failed);
+ }
+
+ return (Twine(S.substr(0, Pos)) + To + S.substr(Pos + From.size())).str();
+}
+
+static const std::string NullImportDescriptorSymbolName =
+ "__NULL_IMPORT_DESCRIPTOR";
+
+namespace {
+// This class constructs various small object files necessary to support
+// linking symbols imported from a DLL. The contents are pretty strictly
+// defined and nearly entirely static. The details of the structures and file
+// layout are defined in WINNT.h and the PE/COFF specification.
+class ObjectFactory {
+ using u16 = support::ulittle16_t;
+ using u32 = support::ulittle32_t;
+ MachineTypes Machine;
+ BumpPtrAllocator Alloc;
+ StringRef DLLName;
+ StringRef Library;
+ std::string ImportDescriptorSymbolName;
+ std::string NullThunkSymbolName;
+
+public:
+ ObjectFactory(StringRef S, MachineTypes M)
+ : Machine(M), DLLName(S), Library(S.drop_back(4)),
+ ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()),
+ NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {}
+
+ // Creates an Import Descriptor. This is a small object file which contains a
+ // reference to the terminators and contains the library name (entry) for the
+ // import name table. It will force the linker to construct the necessary
+ // structure to import symbols from the DLL.
+ NewArchiveMember createImportDescriptor(std::vector<uint8_t> &Buffer);
+
+ // Creates a NULL import descriptor. This is a small object file which
+ // contains a NULL import descriptor. It is used to terminate the imports
+ // from a specific DLL.
+ NewArchiveMember createNullImportDescriptor(std::vector<uint8_t> &Buffer);
+
+ // Create a NULL Thunk Entry. This is a small object file which contains a
+ // NULL Import Address Table entry and a NULL Import Lookup Table Entry. It
+ // is used to terminate the IAT and ILT.
+ NewArchiveMember createNullThunk(std::vector<uint8_t> &Buffer);
+
+ // Create a short import file, as described in the PE/COFF spec, section 7:
+ // Import Library Format.
+ NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal,
+ ImportType Type, ImportNameType NameType);
+};
+} // namespace
+
+NewArchiveMember
+ObjectFactory::createImportDescriptor(std::vector<uint8_t> &Buffer) {
+ static const uint32_t NumberOfSections = 2;
+ static const uint32_t NumberOfSymbols = 7;
+ static const uint32_t NumberOfRelocations = 3;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$2
+ sizeof(coff_import_directory_table_entry) +
+ NumberOfRelocations * sizeof(coff_relocation) +
+ // .idata$4
+ (DLLName.size() + 1)),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ static const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'},
+ u32(0),
+ u32(0),
+ u32(sizeof(coff_import_directory_table_entry)),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ sizeof(coff_import_directory_table_entry)),
+ u32(0),
+ u16(NumberOfRelocations),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'},
+ u32(0),
+ u32(0),
+ u32(DLLName.size() + 1),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ sizeof(coff_import_directory_table_entry) +
+ NumberOfRelocations * sizeof(coff_relocation)),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$2
+ static const coff_import_directory_table_entry ImportDescriptor{
+ u32(0), u32(0), u32(0), u32(0), u32(0),
+ };
+ append(Buffer, ImportDescriptor);
+
+ static const coff_relocation RelocationTable[NumberOfRelocations] = {
+ {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2),
+ u16(getImgRelRelocation(Machine))},
+ {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)),
+ u32(3), u16(getImgRelRelocation(Machine))},
+ {u32(offsetof(coff_import_directory_table_entry, ImportAddressTableRVA)),
+ u32(4), u16(getImgRelRelocation(Machine))},
+ };
+ append(Buffer, RelocationTable);
+
+ // .idata$6
+ auto S = Buffer.size();
+ Buffer.resize(S + DLLName.size() + 1);
+ memcpy(&Buffer[S], DLLName.data(), DLLName.size());
+ Buffer[S + DLLName.size()] = '\0';
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}},
+ u32(0),
+ u16(2),
+ u16(0),
+ IMAGE_SYM_CLASS_STATIC,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '4'}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+ reinterpret_cast<StringTableOffset &>(SymbolTable[0].Name).Offset =
+ sizeof(uint32_t);
+ reinterpret_cast<StringTableOffset &>(SymbolTable[5].Name).Offset =
+ sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1;
+ reinterpret_cast<StringTableOffset &>(SymbolTable[6].Name).Offset =
+ sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1 +
+ NullImportDescriptorSymbolName.length() + 1;
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer,
+ {ImportDescriptorSymbolName, NullImportDescriptorSymbolName,
+ NullThunkSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef(F, DLLName)};
+}
+
+NewArchiveMember
+ObjectFactory::createNullImportDescriptor(std::vector<uint8_t> &Buffer) {
+ static const uint32_t NumberOfSections = 1;
+ static const uint32_t NumberOfSymbols = 1;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$3
+ sizeof(coff_import_directory_table_entry)),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ static const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'},
+ u32(0),
+ u32(0),
+ u32(sizeof(coff_import_directory_table_entry)),
+ u32(sizeof(coff_file_header) +
+ (NumberOfSections * sizeof(coff_section))),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$3
+ static const coff_import_directory_table_entry ImportDescriptor{
+ u32(0), u32(0), u32(0), u32(0), u32(0),
+ };
+ append(Buffer, ImportDescriptor);
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+ reinterpret_cast<StringTableOffset &>(SymbolTable[0].Name).Offset =
+ sizeof(uint32_t);
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer, {NullImportDescriptorSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef(F, DLLName)};
+}
+
+NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) {
+ static const uint32_t NumberOfSections = 2;
+ static const uint32_t NumberOfSymbols = 1;
+ uint32_t VASize = is32bit(Machine) ? 4 : 8;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$5
+ VASize +
+ // .idata$4
+ VASize),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ static const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'},
+ u32(0),
+ u32(0),
+ u32(VASize),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
+ : IMAGE_SCN_ALIGN_8BYTES) |
+ IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
+ IMAGE_SCN_MEM_WRITE)},
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '4'},
+ u32(0),
+ u32(0),
+ u32(VASize),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ VASize),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
+ : IMAGE_SCN_ALIGN_8BYTES) |
+ IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
+ IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$5, ILT
+ append(Buffer, u32(0));
+ if (!is32bit(Machine))
+ append(Buffer, u32(0));
+
+ // .idata$4, IAT
+ append(Buffer, u32(0));
+ if (!is32bit(Machine))
+ append(Buffer, u32(0));
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+ reinterpret_cast<StringTableOffset &>(SymbolTable[0].Name).Offset =
+ sizeof(uint32_t);
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer, {NullThunkSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef{F, DLLName}};
+}
+
+NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
+ uint16_t Ordinal,
+ ImportType ImportType,
+ ImportNameType NameType) {
+ size_t ImpSize = DLLName.size() + Sym.size() + 2; // +2 for NULs
+ size_t Size = sizeof(coff_import_header) + ImpSize;
+ char *Buf = Alloc.Allocate<char>(Size);
+ memset(Buf, 0, Size);
+ char *P = Buf;
+
+ // Write short import library.
+ auto *Imp = reinterpret_cast<coff_import_header *>(P);
+ P += sizeof(*Imp);
+ Imp->Sig2 = 0xFFFF;
+ Imp->Machine = Machine;
+ Imp->SizeOfData = ImpSize;
+ if (Ordinal > 0)
+ Imp->OrdinalHint = Ordinal;
+ Imp->TypeInfo = (NameType << 2) | ImportType;
+
+ // Write symbol name and DLL name.
+ memcpy(P, Sym.data(), Sym.size());
+ P += Sym.size() + 1;
+ memcpy(P, DLLName.data(), DLLName.size());
+
+ return {MemoryBufferRef(StringRef(Buf, Size), DLLName)};
+}
+
+std::error_code writeImportLibrary(StringRef DLLName, StringRef Path,
+ ArrayRef<COFFShortExport> Exports,
+ MachineTypes Machine) {
+
+ std::vector<NewArchiveMember> Members;
+ ObjectFactory OF(llvm::sys::path::filename(DLLName), Machine);
+
+ std::vector<uint8_t> ImportDescriptor;
+ Members.push_back(OF.createImportDescriptor(ImportDescriptor));
+
+ std::vector<uint8_t> NullImportDescriptor;
+ Members.push_back(OF.createNullImportDescriptor(NullImportDescriptor));
+
+ std::vector<uint8_t> NullThunk;
+ Members.push_back(OF.createNullThunk(NullThunk));
+
+ for (COFFShortExport E : Exports) {
+ if (E.Private)
+ continue;
+
+ ImportType ImportType = IMPORT_CODE;
+ if (E.Data)
+ ImportType = IMPORT_DATA;
+ if (E.Constant)
+ ImportType = IMPORT_CONST;
+
+ StringRef SymbolName = E.isWeak() ? E.ExtName : E.Name;
+ ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
+ Expected<std::string> Name = E.ExtName.empty()
+ ? SymbolName
+ : replace(SymbolName, E.Name, E.ExtName);
+
+ if (!Name) {
+ return errorToErrorCode(Name.takeError());
+ }
+
+ Members.push_back(
+ OF.createShortImport(*Name, E.Ordinal, ImportType, NameType));
+ }
+
+ std::pair<StringRef, std::error_code> Result =
+ writeArchive(Path, Members, /*WriteSymtab*/ true, object::Archive::K_GNU,
+ /*Deterministic*/ true, /*Thin*/ false);
+
+ return Result.second;
+}
+
+} // namespace object
+} // namespace llvm
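A usage sketch (not part of the patch): the new writeImportLibrary entry point takes a list of COFFShortExport records and emits the archive members built above. The file names and the single export below are illustrative assumptions.

#include "llvm/Object/COFFImportFile.h"
using namespace llvm;
using namespace llvm::object;

static std::error_code makeExampleImportLib() {
  COFFShortExport E;
  E.Name = "my_function"; // hypothetical exported symbol
  E.Ordinal = 0;
  E.Noname = E.Data = E.Constant = E.Private = false;
  COFFShortExport Exports[] = {E};
  // Produces foo.lib containing the import descriptor, null import
  // descriptor, null thunk, and one short-import member for my_function.
  return writeImportLibrary("foo.dll", "foo.lib", Exports,
                            COFF::IMAGE_FILE_MACHINE_AMD64);
}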
diff --git a/lib/Object/COFFModuleDefinition.cpp b/lib/Object/COFFModuleDefinition.cpp
new file mode 100644
index 000000000000..0d69cb6b709c
--- /dev/null
+++ b/lib/Object/COFFModuleDefinition.cpp
@@ -0,0 +1,319 @@
+//===--- COFFModuleDefinition.cpp - Simple DEF parser ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Windows-specific.
+// A parser for the module-definition file (.def file).
+//
+// The format of module-definition files is described in this document:
+// https://msdn.microsoft.com/en-us/library/28d6s79h.aspx
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFFModuleDefinition.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm::COFF;
+using namespace llvm;
+
+namespace llvm {
+namespace object {
+
+enum Kind {
+ Unknown,
+ Eof,
+ Identifier,
+ Comma,
+ Equal,
+ KwBase,
+ KwConstant,
+ KwData,
+ KwExports,
+ KwHeapsize,
+ KwLibrary,
+ KwName,
+ KwNoname,
+ KwPrivate,
+ KwStacksize,
+ KwVersion,
+};
+
+struct Token {
+ explicit Token(Kind T = Unknown, StringRef S = "") : K(T), Value(S) {}
+ Kind K;
+ StringRef Value;
+};
+
+static bool isDecorated(StringRef Sym) {
+ return Sym.startswith("_") || Sym.startswith("@") || Sym.startswith("?");
+}
+
+static Error createError(const Twine &Err) {
+ return make_error<StringError>(StringRef(Err.str()),
+ object_error::parse_failed);
+}
+
+class Lexer {
+public:
+ Lexer(StringRef S) : Buf(S) {}
+
+ Token lex() {
+ Buf = Buf.trim();
+ if (Buf.empty())
+ return Token(Eof);
+
+ switch (Buf[0]) {
+ case '\0':
+ return Token(Eof);
+ case ';': {
+ size_t End = Buf.find('\n');
+ Buf = (End == Buf.npos) ? "" : Buf.drop_front(End);
+ return lex();
+ }
+ case '=':
+ Buf = Buf.drop_front();
+ return Token(Equal, "=");
+ case ',':
+ Buf = Buf.drop_front();
+ return Token(Comma, ",");
+ case '"': {
+ StringRef S;
+ std::tie(S, Buf) = Buf.substr(1).split('"');
+ return Token(Identifier, S);
+ }
+ default: {
+ size_t End = Buf.find_first_of("=,\r\n \t\v");
+ StringRef Word = Buf.substr(0, End);
+ Kind K = llvm::StringSwitch<Kind>(Word)
+ .Case("BASE", KwBase)
+ .Case("CONSTANT", KwConstant)
+ .Case("DATA", KwData)
+ .Case("EXPORTS", KwExports)
+ .Case("HEAPSIZE", KwHeapsize)
+ .Case("LIBRARY", KwLibrary)
+ .Case("NAME", KwName)
+ .Case("NONAME", KwNoname)
+ .Case("PRIVATE", KwPrivate)
+ .Case("STACKSIZE", KwStacksize)
+ .Case("VERSION", KwVersion)
+ .Default(Identifier);
+ Buf = (End == Buf.npos) ? "" : Buf.drop_front(End);
+ return Token(K, Word);
+ }
+ }
+ }
+
+private:
+ StringRef Buf;
+};
+
+class Parser {
+public:
+ explicit Parser(StringRef S, MachineTypes M) : Lex(S), Machine(M) {}
+
+ Expected<COFFModuleDefinition> parse() {
+ do {
+ if (Error Err = parseOne())
+ return std::move(Err);
+ } while (Tok.K != Eof);
+ return Info;
+ }
+
+private:
+ void read() {
+ if (Stack.empty()) {
+ Tok = Lex.lex();
+ return;
+ }
+ Tok = Stack.back();
+ Stack.pop_back();
+ }
+
+ Error readAsInt(uint64_t *I) {
+ read();
+ if (Tok.K != Identifier || Tok.Value.getAsInteger(10, *I))
+ return createError("integer expected");
+ return Error::success();
+ }
+
+ Error expect(Kind Expected, StringRef Msg) {
+ read();
+ if (Tok.K != Expected)
+ return createError(Msg);
+ return Error::success();
+ }
+
+ void unget() { Stack.push_back(Tok); }
+
+ Error parseOne() {
+ read();
+ switch (Tok.K) {
+ case Eof:
+ return Error::success();
+ case KwExports:
+ for (;;) {
+ read();
+ if (Tok.K != Identifier) {
+ unget();
+ return Error::success();
+ }
+ if (Error Err = parseExport())
+ return Err;
+ }
+ case KwHeapsize:
+ return parseNumbers(&Info.HeapReserve, &Info.HeapCommit);
+ case KwStacksize:
+ return parseNumbers(&Info.StackReserve, &Info.StackCommit);
+ case KwLibrary:
+ case KwName: {
+ bool IsDll = Tok.K == KwLibrary; // Check before parseName.
+ std::string Name;
+ if (Error Err = parseName(&Name, &Info.ImageBase))
+ return Err;
+ // Append the appropriate file extension if not already present.
+ StringRef Ext = IsDll ? ".dll" : ".exe";
+ if (!StringRef(Name).endswith_lower(Ext))
+ Name += Ext;
+
+ // Set the output file, but don't override /out if it was already passed.
+ if (Info.OutputFile.empty())
+ Info.OutputFile = Name;
+ return Error::success();
+ }
+ case KwVersion:
+ return parseVersion(&Info.MajorImageVersion, &Info.MinorImageVersion);
+ default:
+ return createError("unknown directive: " + Tok.Value);
+ }
+ }
+
+ Error parseExport() {
+ COFFShortExport E;
+ E.Name = Tok.Value;
+ read();
+ if (Tok.K == Equal) {
+ read();
+ if (Tok.K != Identifier)
+ return createError("identifier expected, but got " + Tok.Value);
+ E.ExtName = E.Name;
+ E.Name = Tok.Value;
+ } else {
+ unget();
+ }
+
+ if (Machine == IMAGE_FILE_MACHINE_I386) {
+ if (!isDecorated(E.Name))
+ E.Name = (std::string("_").append(E.Name));
+ if (!E.ExtName.empty() && !isDecorated(E.ExtName))
+ E.ExtName = (std::string("_").append(E.ExtName));
+ }
+
+ for (;;) {
+ read();
+ if (Tok.K == Identifier && Tok.Value[0] == '@') {
+ Tok.Value.drop_front().getAsInteger(10, E.Ordinal);
+ read();
+ if (Tok.K == KwNoname) {
+ E.Noname = true;
+ } else {
+ unget();
+ }
+ continue;
+ }
+ if (Tok.K == KwData) {
+ E.Data = true;
+ continue;
+ }
+ if (Tok.K == KwConstant) {
+ E.Constant = true;
+ continue;
+ }
+ if (Tok.K == KwPrivate) {
+ E.Private = true;
+ continue;
+ }
+ unget();
+ Info.Exports.push_back(E);
+ return Error::success();
+ }
+ }
+
+ // HEAPSIZE/STACKSIZE reserve[,commit]
+ Error parseNumbers(uint64_t *Reserve, uint64_t *Commit) {
+ if (Error Err = readAsInt(Reserve))
+ return Err;
+ read();
+ if (Tok.K != Comma) {
+ unget();
+ Commit = nullptr;
+ return Error::success();
+ }
+ if (Error Err = readAsInt(Commit))
+ return Err;
+ return Error::success();
+ }
+
+ // NAME outputPath [BASE=address]
+ Error parseName(std::string *Out, uint64_t *Baseaddr) {
+ read();
+ if (Tok.K == Identifier) {
+ *Out = Tok.Value;
+ } else {
+ *Out = "";
+ unget();
+ return Error::success();
+ }
+ read();
+ if (Tok.K == KwBase) {
+ if (Error Err = expect(Equal, "'=' expected"))
+ return Err;
+ if (Error Err = readAsInt(Baseaddr))
+ return Err;
+ } else {
+ unget();
+ *Baseaddr = 0;
+ }
+ return Error::success();
+ }
+
+ // VERSION major[.minor]
+ Error parseVersion(uint32_t *Major, uint32_t *Minor) {
+ read();
+ if (Tok.K != Identifier)
+ return createError("identifier expected, but got " + Tok.Value);
+ StringRef V1, V2;
+ std::tie(V1, V2) = Tok.Value.split('.');
+ if (V1.getAsInteger(10, *Major))
+ return createError("integer expected, but got " + Tok.Value);
+ if (V2.empty())
+ *Minor = 0;
+ else if (V2.getAsInteger(10, *Minor))
+ return createError("integer expected, but got " + Tok.Value);
+ return Error::success();
+ }
+
+ Lexer Lex;
+ Token Tok;
+ std::vector<Token> Stack;
+ MachineTypes Machine;
+ COFFModuleDefinition Info;
+};
+
+Expected<COFFModuleDefinition> parseCOFFModuleDefinition(MemoryBufferRef MB,
+ MachineTypes Machine) {
+ return Parser(MB.getBuffer(), Machine).parse();
+}
+
+} // namespace object
+} // namespace llvm
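A minimal sketch of driving the parser above; the .def text and the error handling are illustrative assumptions.

#include "llvm/Object/COFFModuleDefinition.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;
using namespace llvm::object;

static void parseExampleDef() {
  StringRef Def = "LIBRARY foo\n"
                  "EXPORTS\n"
                  "  my_function\n"
                  "  my_data DATA\n";
  Expected<COFFModuleDefinition> M = parseCOFFModuleDefinition(
      MemoryBufferRef(Def, "example.def"), COFF::IMAGE_FILE_MACHINE_AMD64);
  if (!M) {
    consumeError(M.takeError());
    return;
  }
  // M->OutputFile == "foo.dll" (the extension is appended by parseOne), and
  // M->Exports holds my_function and my_data (the latter with Data set).
}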
diff --git a/lib/Object/Decompressor.cpp b/lib/Object/Decompressor.cpp
index 0be602b1fc1a..89d199a3f3f6 100644
--- a/lib/Object/Decompressor.cpp
+++ b/lib/Object/Decompressor.cpp
@@ -88,11 +88,6 @@ bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) {
return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name);
}
-Error Decompressor::decompress(SmallString<32> &Out) {
- Out.resize(DecompressedSize);
- return decompress({Out.data(), (size_t)DecompressedSize});
-}
-
Error Decompressor::decompress(MutableArrayRef<char> Buffer) {
size_t Size = Buffer.size();
return zlib::uncompress(SectionData, Buffer.data(), Size);
diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp
new file mode 100644
index 000000000000..b52563469094
--- /dev/null
+++ b/lib/Object/WindowsResource.cpp
@@ -0,0 +1,90 @@
+//===-- WindowsResource.cpp -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the .res file class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/WindowsResource.h"
+#include "llvm/Object/Error.h"
+#include <system_error>
+
+namespace llvm {
+namespace object {
+
+static const size_t ResourceMagicSize = 16;
+
+static const size_t NullEntrySize = 16;
+
+#define RETURN_IF_ERROR(X) \
+ if (auto EC = X) \
+ return EC;
+
+WindowsResource::WindowsResource(MemoryBufferRef Source)
+ : Binary(Binary::ID_WinRes, Source) {
+ size_t LeadingSize = ResourceMagicSize + NullEntrySize;
+ BBS = BinaryByteStream(Data.getBuffer().drop_front(LeadingSize),
+ support::little);
+}
+
+WindowsResource::~WindowsResource() = default;
+
+Expected<std::unique_ptr<WindowsResource>>
+WindowsResource::createWindowsResource(MemoryBufferRef Source) {
+ if (Source.getBufferSize() < ResourceMagicSize + NullEntrySize)
+ return make_error<GenericBinaryError>(
+ "File too small to be a resource file",
+ object_error::invalid_file_type);
+ std::unique_ptr<WindowsResource> Ret(new WindowsResource(Source));
+ return std::move(Ret);
+}
+
+Expected<ResourceEntryRef> WindowsResource::getHeadEntry() {
+ Error Err = Error::success();
+ auto Ref = ResourceEntryRef(BinaryStreamRef(BBS), this, Err);
+ if (Err)
+ return std::move(Err);
+ return Ref;
+}
+
+ResourceEntryRef::ResourceEntryRef(BinaryStreamRef Ref,
+ const WindowsResource *Owner, Error &Err)
+ : Reader(Ref), OwningRes(Owner) {
+ if (loadNext())
+ Err = make_error<GenericBinaryError>("Could not read first entry.",
+ object_error::unexpected_eof);
+}
+
+Error ResourceEntryRef::moveNext(bool &End) {
+ // Reached end of all the entries.
+ if (Reader.bytesRemaining() == 0) {
+ End = true;
+ return Error::success();
+ }
+ RETURN_IF_ERROR(loadNext());
+
+ return Error::success();
+}
+
+Error ResourceEntryRef::loadNext() {
+ uint32_t DataSize;
+ RETURN_IF_ERROR(Reader.readInteger(DataSize));
+ uint32_t HeaderSize;
+ RETURN_IF_ERROR(Reader.readInteger(HeaderSize));
+ // The data and header size ints are themselves part of the header, so we must
+ // subtract them from the size.
+ RETURN_IF_ERROR(
+ Reader.readStreamRef(HeaderBytes, HeaderSize - 2 * sizeof(uint32_t)));
+ RETURN_IF_ERROR(Reader.readStreamRef(DataBytes, DataSize));
+ RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t)));
+ return Error::success();
+}
+
+} // namespace object
+} // namespace llvm
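Sketch of walking the parsed entries with the API above; where the MemoryBufferRef comes from, and what is done per entry, are left open.

#include "llvm/Object/WindowsResource.h"
using namespace llvm;
using namespace llvm::object;

static Error visitEntries(MemoryBufferRef Source) {
  Expected<std::unique_ptr<WindowsResource>> ResOrErr =
      WindowsResource::createWindowsResource(Source);
  if (!ResOrErr)
    return ResOrErr.takeError();
  Expected<ResourceEntryRef> EntryOrErr = (*ResOrErr)->getHeadEntry();
  if (!EntryOrErr)
    return EntryOrErr.takeError();
  ResourceEntryRef Entry = *EntryOrErr; // getHeadEntry already loaded it
  bool End = false;
  while (!End) {
    // ... inspect the current entry here ...
    if (Error Err = Entry.moveNext(End))
      return Err;
  }
  return Error::success();
}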
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index 7076e751071d..6ece7965ce64 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -150,6 +150,10 @@ using namespace llvm;
static cl::opt<unsigned> MaxDevirtIterations("pm-max-devirt-iterations",
cl::ReallyHidden, cl::init(4));
+static cl::opt<bool>
+ RunPartialInlining("enable-npm-partial-inlining", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Run Partial inlining pass"));
static cl::opt<bool> EnableGVNHoist(
"enable-npm-gvn-hoist", cl::init(false), cl::Hidden,
@@ -552,6 +556,11 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// At this point, we expect to have canonical and simple IR which we begin
// *optimizing* for efficient execution going forward.
+ // Run partial inlining pass to partially inline functions that have
+ // large bodies.
+ if (RunPartialInlining)
+ MPM.addPass(PartialInlinerPass());
+
// Eliminate externally available functions now that inlining is over -- we
// won't emit these anyways.
MPM.addPass(EliminateAvailableExternallyPass());
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 17144522db82..2a916b14bc22 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1398,8 +1398,8 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
DEBUG(dbgs() << '\n');
}
-void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
- unsigned rhsWords, APInt *Quotient, APInt *Remainder) {
+void APInt::divide(const WordType *LHS, unsigned lhsWords, const WordType *RHS,
+ unsigned rhsWords, WordType *Quotient, WordType *Remainder) {
assert(lhsWords >= rhsWords && "Fractional result");
// First, compose the values into an array of 32-bit words instead of
@@ -1436,7 +1436,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// Initialize the dividend
memset(U, 0, (m+n+1)*sizeof(uint32_t));
for (unsigned i = 0; i < lhsWords; ++i) {
- uint64_t tmp = LHS.getRawData()[i];
+ uint64_t tmp = LHS[i];
U[i * 2] = Lo_32(tmp);
U[i * 2 + 1] = Hi_32(tmp);
}
@@ -1445,7 +1445,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// Initialize the divisor
memset(V, 0, (n)*sizeof(uint32_t));
for (unsigned i = 0; i < rhsWords; ++i) {
- uint64_t tmp = RHS.getRawData()[i];
+ uint64_t tmp = RHS[i];
V[i * 2] = Lo_32(tmp);
V[i * 2 + 1] = Hi_32(tmp);
}
@@ -1502,48 +1502,14 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// If the caller wants the quotient
if (Quotient) {
- // Set up the Quotient value's memory.
- Quotient->reallocate(LHS.BitWidth);
- // Clear out any previous bits.
- Quotient->clearAllBits();
-
- // The quotient is in Q. Reconstitute the quotient into Quotient's low
- // order words.
- // This case is currently dead as all users of divide() handle trivial cases
- // earlier.
- if (lhsWords == 1) {
- uint64_t tmp = Make_64(Q[1], Q[0]);
- if (Quotient->isSingleWord())
- Quotient->U.VAL = tmp;
- else
- Quotient->U.pVal[0] = tmp;
- } else {
- assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
- for (unsigned i = 0; i < lhsWords; ++i)
- Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]);
- }
+ for (unsigned i = 0; i < lhsWords; ++i)
+ Quotient[i] = Make_64(Q[i*2+1], Q[i*2]);
}
// If the caller wants the remainder
if (Remainder) {
- // Set up the Remainder value's memory.
- Remainder->reallocate(RHS.BitWidth);
- // Clear out any previous bits.
- Remainder->clearAllBits();
-
- // The remainder is in R. Reconstitute the remainder into Remainder's low
- // order words.
- if (rhsWords == 1) {
- uint64_t tmp = Make_64(R[1], R[0]);
- if (Remainder->isSingleWord())
- Remainder->U.VAL = tmp;
- else
- Remainder->U.pVal[0] = tmp;
- } else {
- assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
- for (unsigned i = 0; i < rhsWords; ++i)
- Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]);
- }
+ for (unsigned i = 0; i < rhsWords; ++i)
+ Remainder[i] = Make_64(R[i*2+1], R[i*2]);
}
// Clean up the memory we allocated.
@@ -1555,7 +1521,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
}
}
-APInt APInt::udiv(const APInt& RHS) const {
+APInt APInt::udiv(const APInt &RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
// First, deal with the easy case
@@ -1588,8 +1554,41 @@ APInt APInt::udiv(const APInt& RHS) const {
return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]);
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Quotient; // to hold result.
- divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr);
+ APInt Quotient(BitWidth, 0); // to hold result.
+ divide(U.pVal, lhsWords, RHS.U.pVal, rhsWords, Quotient.U.pVal, nullptr);
+ return Quotient;
+}
+
+APInt APInt::udiv(uint64_t RHS) const {
+ assert(RHS != 0 && "Divide by zero?");
+
+ // First, deal with the easy case
+ if (isSingleWord())
+ return APInt(BitWidth, U.VAL / RHS);
+
+ // Get some facts about the LHS words.
+ unsigned lhsWords = getNumWords(getActiveBits());
+
+ // Deal with some degenerate cases
+ if (!lhsWords)
+ // 0 / X ===> 0
+ return APInt(BitWidth, 0);
+ if (RHS == 1)
+ // X / 1 ===> X
+ return *this;
+ if (this->ult(RHS))
+ // X / Y ===> 0, iff X < Y
+ return APInt(BitWidth, 0);
+ if (*this == RHS)
+ // X / X ===> 1
+ return APInt(BitWidth, 1);
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
+ // All high words are zero, just use native divide
+ return APInt(BitWidth, this->U.pVal[0] / RHS);
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ APInt Quotient(BitWidth, 0); // to hold result.
+ divide(U.pVal, lhsWords, &RHS, 1, Quotient.U.pVal, nullptr);
return Quotient;
}
@@ -1604,7 +1603,18 @@ APInt APInt::sdiv(const APInt &RHS) const {
return this->udiv(RHS);
}
-APInt APInt::urem(const APInt& RHS) const {
+APInt APInt::sdiv(int64_t RHS) const {
+ if (isNegative()) {
+ if (RHS < 0)
+ return (-(*this)).udiv(-RHS);
+ return -((-(*this)).udiv(RHS));
+ }
+ if (RHS < 0)
+ return -(this->udiv(-RHS));
+ return this->udiv(RHS);
+}
+
+APInt APInt::urem(const APInt &RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord()) {
assert(RHS.U.VAL != 0 && "Remainder by zero?");
@@ -1637,8 +1647,40 @@ APInt APInt::urem(const APInt& RHS) const {
return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]);
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Remainder;
- divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
+ APInt Remainder(BitWidth, 0);
+ divide(U.pVal, lhsWords, RHS.U.pVal, rhsWords, nullptr, Remainder.U.pVal);
+ return Remainder;
+}
+
+uint64_t APInt::urem(uint64_t RHS) const {
+ assert(RHS != 0 && "Remainder by zero?");
+
+ if (isSingleWord())
+ return U.VAL % RHS;
+
+ // Get some facts about the LHS
+ unsigned lhsWords = getNumWords(getActiveBits());
+
+ // Check the degenerate cases
+ if (lhsWords == 0)
+ // 0 % Y ===> 0
+ return 0;
+ if (RHS == 1)
+ // X % 1 ===> 0
+ return 0;
+ if (this->ult(RHS))
+ // X % Y ===> X, iff X < Y
+ return getZExtValue();
+ if (*this == RHS)
+ // X % X == 0;
+ return 0;
+ if (lhsWords == 1)
+ // All high words are zero, just use native remainder
+ return U.pVal[0] % RHS;
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ uint64_t Remainder;
+ divide(U.pVal, lhsWords, &RHS, 1, nullptr, &Remainder);
return Remainder;
}
@@ -1653,6 +1695,17 @@ APInt APInt::srem(const APInt &RHS) const {
return this->urem(RHS);
}
+int64_t APInt::srem(int64_t RHS) const {
+ if (isNegative()) {
+ if (RHS < 0)
+ return -((-(*this)).urem(-RHS));
+ return -((-(*this)).urem(RHS));
+ }
+ if (RHS < 0)
+ return this->urem(-RHS);
+ return this->urem(RHS);
+}
+
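A worked fragment (assuming the usual APInt.h include) showing that the new int64_t overloads keep the same round-toward-zero semantics as the APInt versions:

APInt X(128, -7, /*isSigned=*/true);
APInt Q = X.sdiv(2);    // Q == -3: the quotient rounds toward zero
int64_t R = X.srem(2);  // R == -1: the remainder takes the dividend's sign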
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
@@ -1698,20 +1751,90 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
+ // Make sure there is enough space to hold the results.
+ // NOTE: This assumes that reallocate won't affect any bits if it doesn't
+ // change the size. This is necessary if Quotient or Remainder is aliased
+ // with LHS or RHS.
+ Quotient.reallocate(BitWidth);
+ Remainder.reallocate(BitWidth);
+
if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
// There is only one word to consider so use the native versions.
uint64_t lhsValue = LHS.U.pVal[0];
uint64_t rhsValue = RHS.U.pVal[0];
- // Make sure there is enough space to hold the results.
- Quotient.reallocate(BitWidth);
- Remainder.reallocate(BitWidth);
Quotient = lhsValue / rhsValue;
Remainder = lhsValue % rhsValue;
return;
}
// Okay, lets do it the long way
- divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder);
+ divide(LHS.U.pVal, lhsWords, RHS.U.pVal, rhsWords, Quotient.U.pVal,
+ Remainder.U.pVal);
+ // Clear the rest of the Quotient and Remainder.
+ std::memset(Quotient.U.pVal + lhsWords, 0,
+ (getNumWords(BitWidth) - lhsWords) * APINT_WORD_SIZE);
+ std::memset(Remainder.U.pVal + rhsWords, 0,
+ (getNumWords(BitWidth) - rhsWords) * APINT_WORD_SIZE);
+}
+
+void APInt::udivrem(const APInt &LHS, uint64_t RHS, APInt &Quotient,
+ uint64_t &Remainder) {
+ assert(RHS != 0 && "Divide by zero?");
+ unsigned BitWidth = LHS.BitWidth;
+
+ // First, deal with the easy case
+ if (LHS.isSingleWord()) {
+ uint64_t QuotVal = LHS.U.VAL / RHS;
+ Remainder = LHS.U.VAL % RHS;
+ Quotient = APInt(BitWidth, QuotVal);
+ return;
+ }
+
+ // Get some size facts about the dividend and divisor
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
+
+ // Check the degenerate cases
+ if (lhsWords == 0) {
+ Quotient = 0; // 0 / Y ===> 0
+ Remainder = 0; // 0 % Y ===> 0
+ return;
+ }
+
+ if (RHS == 1) {
+ Quotient = LHS; // X / 1 ===> X
+ Remainder = 0;  // X % 1 ===> 0
+ return;
+ }
+
+ if (LHS.ult(RHS)) {
+ Remainder = LHS.getZExtValue(); // X % Y ===> X, iff X < Y
+ Quotient = 0; // X / Y ===> 0, iff X < Y
+ return;
+ }
+
+ if (LHS == RHS) {
+ Quotient = 1; // X / X ===> 1
+ Remainder = 0; // X % X ===> 0;
+ return;
+ }
+
+ // Make sure there is enough space to hold the results.
+ // NOTE: This assumes that reallocate won't affect any bits if it doesn't
+ // change the size. This is necessary if Quotient is aliased with LHS.
+ Quotient.reallocate(BitWidth);
+
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
+ // There is only one word to consider so use the native versions.
+ uint64_t lhsValue = LHS.U.pVal[0];
+ Quotient = lhsValue / RHS;
+ Remainder = lhsValue % RHS;
+ return;
+ }
+
+ // Okay, lets do it the long way
+ divide(LHS.U.pVal, lhsWords, &RHS, 1, Quotient.U.pVal, &Remainder);
+ // Clear the rest of the Quotient.
+ std::memset(Quotient.U.pVal + lhsWords, 0,
+ (getNumWords(BitWidth) - lhsWords) * APINT_WORD_SIZE);
}
void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
@@ -1732,6 +1855,26 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
}
}
+void APInt::sdivrem(const APInt &LHS, int64_t RHS,
+ APInt &Quotient, int64_t &Remainder) {
+ uint64_t R = Remainder;
+ if (LHS.isNegative()) {
+ if (RHS < 0)
+ APInt::udivrem(-LHS, -RHS, Quotient, R);
+ else {
+ APInt::udivrem(-LHS, RHS, Quotient, R);
+ Quotient.negate();
+ }
+ R = -R;
+ } else if (RHS < 0) {
+ APInt::udivrem(LHS, -RHS, Quotient, R);
+ Quotient.negate();
+ } else {
+ APInt::udivrem(LHS, RHS, Quotient, R);
+ }
+ Remainder = R;
+}
+
APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this+RHS;
Overflow = isNonNegative() == RHS.isNonNegative() &&
@@ -1962,11 +2105,9 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
Tmp.lshrInPlace(ShiftAmt);
}
} else {
- APInt divisor(Tmp.getBitWidth(), Radix);
- APInt APdigit;
while (Tmp.getBoolValue()) {
- udivrem(Tmp, divisor, Tmp, APdigit);
- unsigned Digit = (unsigned)APdigit.getZExtValue();
+ uint64_t Digit;
+ udivrem(Tmp, Radix, Tmp, Digit);
assert(Digit < Radix && "divide failed");
Str.push_back(Digits[Digit]);
}
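The toString change above is the motivating use: with the uint64_t overload the radix no longer needs a temporary APInt divisor or remainder. A small illustrative fragment of the same pattern (value and buffer size are made up):

APInt Value(256, "12345678901234567890", 10);
SmallString<64> Str;
while (Value.getBoolValue()) {
  uint64_t Digit;
  APInt::udivrem(Value, 10, Value, Digit); // Value /= 10; Digit = old Value % 10
  Str.push_back(static_cast<char>('0' + Digit));
}
std::reverse(Str.begin(), Str.end()); // Str == "12345678901234567890"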
diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp
index 702d98770e05..5c277448a765 100644
--- a/lib/Support/BinaryStreamReader.cpp
+++ b/lib/Support/BinaryStreamReader.cpp
@@ -13,9 +13,18 @@
#include "llvm/Support/BinaryStreamRef.h"
using namespace llvm;
+using endianness = llvm::support::endianness;
-BinaryStreamReader::BinaryStreamReader(BinaryStreamRef S)
- : Stream(S), Offset(0) {}
+BinaryStreamReader::BinaryStreamReader(BinaryStreamRef Ref) : Stream(Ref) {}
+
+BinaryStreamReader::BinaryStreamReader(BinaryStream &Stream) : Stream(Stream) {}
+
+BinaryStreamReader::BinaryStreamReader(ArrayRef<uint8_t> Data,
+ endianness Endian)
+ : Stream(Data, Endian) {}
+
+BinaryStreamReader::BinaryStreamReader(StringRef Data, endianness Endian)
+ : Stream(Data, Endian) {}
Error BinaryStreamReader::readLongestContiguousChunk(
ArrayRef<uint8_t> &Buffer) {
@@ -86,6 +95,11 @@ Error BinaryStreamReader::skip(uint32_t Amount) {
return Error::success();
}
+Error BinaryStreamReader::padToAlignment(uint32_t Align) {
+ uint32_t NewOffset = alignTo(Offset, Align);
+ return skip(NewOffset - Offset);
+}
+
uint8_t BinaryStreamReader::peek() const {
ArrayRef<uint8_t> Buffer;
auto EC = Stream.readBytes(Offset, 1, Buffer);
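A sketch of the new ArrayRef constructor and padToAlignment; the byte layout (a length, a C string, padding, another length) is an illustrative assumption.

static Error readExample() {
  uint8_t Bytes[] = {1, 0, 0, 0, 'a', 'b', 0, 0, 5, 0, 0, 0};
  BinaryStreamReader Reader(makeArrayRef(Bytes), llvm::support::little);
  uint32_t First, Second;
  StringRef S;
  if (auto EC = Reader.readInteger(First))  // First == 1
    return EC;
  if (auto EC = Reader.readCString(S))      // S == "ab"
    return EC;
  if (auto EC = Reader.padToAlignment(4))   // skip to the next 4-byte boundary
    return EC;
  if (auto EC = Reader.readInteger(Second)) // Second == 5
    return EC;
  return Error::success();
}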
diff --git a/lib/Support/BinaryStreamRef.cpp b/lib/Support/BinaryStreamRef.cpp
new file mode 100644
index 000000000000..fe9a8171e146
--- /dev/null
+++ b/lib/Support/BinaryStreamRef.cpp
@@ -0,0 +1,137 @@
+//===- BinaryStreamRef.cpp - ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/BinaryByteStream.h"
+
+using namespace llvm;
+using namespace llvm::support;
+
+namespace {
+
+class ArrayRefImpl : public BinaryStream {
+public:
+ ArrayRefImpl(ArrayRef<uint8_t> Data, endianness Endian) : BBS(Data, Endian) {}
+
+ llvm::support::endianness getEndian() const override {
+ return BBS.getEndian();
+ }
+ Error readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readBytes(Offset, Size, Buffer);
+ }
+ Error readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readLongestContiguousChunk(Offset, Buffer);
+ }
+ uint32_t getLength() override { return BBS.getLength(); }
+
+private:
+ BinaryByteStream BBS;
+};
+
+class MutableArrayRefImpl : public WritableBinaryStream {
+public:
+ MutableArrayRefImpl(MutableArrayRef<uint8_t> Data, endianness Endian)
+ : BBS(Data, Endian) {}
+
+ // Inherited via WritableBinaryStream
+ llvm::support::endianness getEndian() const override {
+ return BBS.getEndian();
+ }
+ Error readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readBytes(Offset, Size, Buffer);
+ }
+ Error readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readLongestContiguousChunk(Offset, Buffer);
+ }
+ uint32_t getLength() override { return BBS.getLength(); }
+
+ Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) override {
+ return BBS.writeBytes(Offset, Data);
+ }
+ Error commit() override { return BBS.commit(); }
+
+private:
+ MutableBinaryByteStream BBS;
+};
+}
+
+BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream)
+ : BinaryStreamRef(Stream, 0, Stream.getLength()) {}
+BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream, uint32_t Offset,
+ uint32_t Length)
+ : BinaryStreamRefBase(Stream, Offset, Length) {}
+BinaryStreamRef::BinaryStreamRef(ArrayRef<uint8_t> Data, endianness Endian)
+ : BinaryStreamRefBase(std::make_shared<ArrayRefImpl>(Data, Endian), 0,
+ Data.size()) {}
+BinaryStreamRef::BinaryStreamRef(StringRef Data, endianness Endian)
+ : BinaryStreamRef(makeArrayRef(Data.bytes_begin(), Data.bytes_end()),
+ Endian) {}
+
+BinaryStreamRef::BinaryStreamRef(const BinaryStreamRef &Other)
+ : BinaryStreamRefBase(Other) {}
+
+Error BinaryStreamRef::readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) const {
+ if (auto EC = checkOffset(Offset, Size))
+ return EC;
+ return BorrowedImpl->readBytes(ViewOffset + Offset, Size, Buffer);
+}
+
+Error BinaryStreamRef::readLongestContiguousChunk(
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ if (auto EC = checkOffset(Offset, 1))
+ return EC;
+
+ if (auto EC =
+ BorrowedImpl->readLongestContiguousChunk(ViewOffset + Offset, Buffer))
+ return EC;
+ // This StreamRef might refer to a smaller window over a larger stream. In
+ // that case we will have read out more bytes than we should return, because
+ // we should not read past the end of the current view.
+ uint32_t MaxLength = Length - Offset;
+ if (Buffer.size() > MaxLength)
+ Buffer = Buffer.slice(0, MaxLength);
+ return Error::success();
+}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream)
+ : WritableBinaryStreamRef(Stream, 0, Stream.getLength()) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream,
+ uint32_t Offset,
+ uint32_t Length)
+ : BinaryStreamRefBase(Stream, Offset, Length) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
+ endianness Endian)
+ : BinaryStreamRefBase(std::make_shared<MutableArrayRefImpl>(Data, Endian),
+ 0, Data.size()) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(
+ const WritableBinaryStreamRef &Other)
+ : BinaryStreamRefBase(Other) {}
+
+Error WritableBinaryStreamRef::writeBytes(uint32_t Offset,
+ ArrayRef<uint8_t> Data) const {
+ if (auto EC = checkOffset(Offset, Data.size()))
+ return EC;
+
+ return BorrowedImpl->writeBytes(ViewOffset + Offset, Data);
+}
+
+WritableBinaryStreamRef::operator BinaryStreamRef() const {
+ return BinaryStreamRef(*BorrowedImpl, ViewOffset, Length);
+}
+
+/// \brief For buffered streams, commits changes to the backing store.
+Error WritableBinaryStreamRef::commit() { return BorrowedImpl->commit(); }
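A round-trip sketch using the MutableArrayRef-backed ref added above; the storage size and the value written are illustrative.

static Error roundTrip() {
  uint8_t Storage[4] = {};
  WritableBinaryStreamRef Ref(
      MutableArrayRef<uint8_t>(Storage, sizeof(Storage)), llvm::support::little);
  BinaryStreamWriter Writer(Ref);
  uint32_t In = 0xDEADBEEF;
  if (auto EC = Writer.writeInteger(In))
    return EC;
  BinaryStreamReader Reader(Ref); // via the implicit BinaryStreamRef conversion
  uint32_t Out;
  if (auto EC = Reader.readInteger(Out)) // Out == 0xDEADBEEF
    return EC;
  return Error::success();
}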
diff --git a/lib/Support/BinaryStreamWriter.cpp b/lib/Support/BinaryStreamWriter.cpp
index d78dbc68f593..b22eb1ed12d0 100644
--- a/lib/Support/BinaryStreamWriter.cpp
+++ b/lib/Support/BinaryStreamWriter.cpp
@@ -15,8 +15,15 @@
using namespace llvm;
-BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStreamRef S)
- : Stream(S), Offset(0) {}
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStreamRef Ref)
+ : Stream(Ref) {}
+
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStream &Stream)
+ : Stream(Stream) {}
+
+BinaryStreamWriter::BinaryStreamWriter(MutableArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian)
+ : Stream(Data, Endian) {}
Error BinaryStreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
if (auto EC = Stream.writeBytes(Offset, Buffer))
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 83376284548f..a12ba4fbfda8 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -39,6 +39,7 @@ add_llvm_library(LLVMSupport
Allocator.cpp
BinaryStreamError.cpp
BinaryStreamReader.cpp
+ BinaryStreamRef.cpp
BinaryStreamWriter.cpp
BlockFrequency.cpp
BranchProbability.cpp
diff --git a/lib/Support/FormattedStream.cpp b/lib/Support/FormattedStream.cpp
index 2ed71c7e4311..c01659604444 100644
--- a/lib/Support/FormattedStream.cpp
+++ b/lib/Support/FormattedStream.cpp
@@ -32,6 +32,7 @@ static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *
switch (*Ptr) {
case '\n':
Line += 1;
+ LLVM_FALLTHROUGH;
case '\r':
Column = 0;
break;
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index eb8108908ac5..b0e3d6898cae 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -1472,6 +1472,24 @@ bool Triple::isLittleEndian() const {
}
}
+bool Triple::isCompatibleWith(const Triple &Other) const {
+ // If vendor is apple, ignore the version number.
+ if (getVendor() == Triple::Apple)
+ return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() &&
+ getVendor() == Other.getVendor() && getOS() == Other.getOS();
+
+ return *this == Other;
+}
+
+std::string Triple::merge(const Triple &Other) const {
+ // If vendor is apple, pick the triple with the larger version number.
+ if (getVendor() == Triple::Apple)
+ if (Other.isOSVersionLT(*this))
+ return str();
+
+ return Other.str();
+}
+
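An illustrative fragment of the intended Darwin behaviour; the triples are made up.

Triple A("x86_64-apple-macosx10.11");
Triple B("x86_64-apple-macosx10.12");
bool Compatible = A.isCompatibleWith(B); // true: OS version ignored for Apple
std::string Merged = A.merge(B);         // "x86_64-apple-macosx10.12": larger version wins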
StringRef Triple::getARMCPUForArch(StringRef MArch) const {
if (MArch.empty())
MArch = getArchName();
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 5ddf66654a67..da68f3165c5e 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -315,8 +315,14 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
// AArch64 Instruction Predicate Definitions.
def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize : Predicate<"Subtarget->getForCodeSize()">;
-def NotForCodeSize : Predicate<"!Subtarget->getForCodeSize()">;
+
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
+ def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+}
include "AArch64InstrFormats.td"
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 1c81d34014fd..b369ee7e4ba2 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -164,12 +164,11 @@ struct AArch64GISelActualAccessor : public GISelAccessor {
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
- const TargetMachine &TM, bool LittleEndian,
- bool ForCodeSize)
+ const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
- TLInfo(TM, *this), GISel(), ForCodeSize(ForCodeSize) {
+ TLInfo(TM, *this), GISel() {
#ifndef LLVM_BUILD_GLOBAL_ISEL
GISelAccessor *AArch64GISel = new GISelAccessor();
#else
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index df54bf3f48e1..7933e58c49ee 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -128,8 +128,6 @@ protected:
/// an optional library.
std::unique_ptr<GISelAccessor> GISel;
- bool ForCodeSize;
-
private:
/// initializeSubtargetDependencies - Initializes using CPUString and the
/// passed in feature string so that we can use initializer lists for
@@ -145,7 +143,7 @@ public:
/// of the specified triple.
AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM,
- bool LittleEndian, bool ForCodeSize);
+ bool LittleEndian);
/// This object will take ownership of \p GISelAccessor.
void setGISelAccessor(GISelAccessor &GISel) {
@@ -274,8 +272,6 @@ public:
}
}
- bool getForCodeSize() const { return ForCodeSize; }
-
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 5a90fd1eb1ba..132f192f2a9a 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -214,7 +214,6 @@ const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- bool ForCodeSize = F.optForSize();
std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
? CPUAttr.getValueAsString().str()
@@ -222,17 +221,15 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
std::string FS = !FSAttr.hasAttribute(Attribute::None)
? FSAttr.getValueAsString().str()
: TargetFS;
- std::string ForCodeSizeStr =
- std::string(ForCodeSize ? "+" : "-") + "forcodesize";
- auto &I = SubtargetMap[CPU + FS + ForCodeSizeStr];
+ auto &I = SubtargetMap[CPU + FS];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
- isLittle, ForCodeSize);
+ isLittle);
}
return I.get();
}
@@ -324,7 +321,7 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -343,7 +340,7 @@ void AArch64PassConfig::addIRPasses() {
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 3f89702bed50..78ff3bbe3d1a 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -27,12 +27,12 @@ class PassRegistry;
class Module;
// R600 Passes
-FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600VectorRegMerger();
+FunctionPass *createR600ExpandSpecialInstrsPass();
FunctionPass *createR600EmitClauseMarkers();
-FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
-FunctionPass *createR600Packetizer(TargetMachine &tm);
-FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass();
+FunctionPass *createR600Packetizer();
+FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
@@ -42,24 +42,24 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
extern char &AMDGPUMachineCFGStructurizerID;
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
-ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
+ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
@@ -97,7 +97,7 @@ void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 3d8db7cd8af5..7235d8fae332 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -27,7 +28,6 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
- const TargetMachine *TM;
AMDGPUAS AS;
static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
@@ -37,8 +37,7 @@ private:
public:
static char ID;
- AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
- ModulePass(ID), TM(TM_) {}
+ AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
@@ -221,8 +220,10 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
if (F.hasFnAttribute("amdgpu-queue-ptr"))
continue;
- bool HasApertureRegs =
- TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
+ .getSubtarget<AMDGPUSubtarget>(F)
+ .hasApertureRegs();
if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
F.addFnAttr("amdgpu-queue-ptr");
}
@@ -231,6 +232,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
- return new AMDGPUAnnotateKernelFeatures(TM);
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+ return new AMDGPUAnnotateKernelFeatures();
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 09bdf8ffcde7..251cb7a2c440 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -38,7 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
unsigned VReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
- CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+ static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
} // End of namespace llvm;
#endif
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index d308f718aae1..4bef7a89bfe3 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,6 +13,8 @@
// Inversion of CCIfInReg
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
// Calling convention for SI
def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@ def CC_SI : CallingConv<[
]>>>
]>;
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
CCIfType<[i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
CCCustom<"allocateKernArg">
]>;
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+ (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+ (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+ CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+ CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf functions
+def RetCC_AMDGPU_Func : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e19314fe0a6c..d923cb117c12 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -48,7 +49,6 @@ namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const GCNTargetMachine *TM;
const SISubtarget *ST = nullptr;
DivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
@@ -127,8 +127,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
public:
static char ID;
- AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
- FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}
+ AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
bool visitFDiv(BinaryOperator &I);
@@ -487,10 +486,15 @@ bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
return false;
- ST = &TM->getSubtarget<SISubtarget>(F);
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<SISubtarget>(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -507,14 +511,14 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return MadeChange;
}
-INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
- "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
char AMDGPUCodeGenPrepare::ID = 0;
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
- return new AMDGPUCodeGenPrepare(TM);
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
+ return new AMDGPUCodeGenPrepare();
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c3ac796a0a44..19fce064783d 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -82,7 +82,7 @@ public:
void PostprocessISelDAG() override;
private:
- SDValue foldFrameIndex(SDValue N) const;
+ std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
@@ -116,9 +116,11 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ bool SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
- bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ bool SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
@@ -1074,13 +1076,33 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}
-SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
- if (auto FI = dyn_cast<FrameIndexSDNode>(N))
- return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
- return N;
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+ auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+ return PSV && PSV->isStack();
+}
+
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+
+ // If we can resolve this to a frame index access, this is relative to the
+ // frame pointer SGPR.
+ return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
+ MVT::i32));
+ }
+
+ // If we don't know this private access is a local stack object, it needs to
+ // be relative to the entry point's scratch wave offset register.
+ return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
+ MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
@@ -1089,7 +1111,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
unsigned Imm = CAddr->getZExtValue();
@@ -1100,6 +1121,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
return true;
}
@@ -1113,19 +1142,20 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc,
// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
- VAddr = foldFrameIndex(N0);
+ std::tie(VAddr, SOffset) = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
// (node)
- VAddr = foldFrameIndex(Addr);
+ std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
@@ -1138,7 +1168,15 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ // FIXME: Get from MachinePointerInfo? We should only be using the frame
+ // offset if we know this is in a call sequence.
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
@@ -1700,12 +1738,46 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Look through operations that obscure a direct use of the low 16 bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
Src = In;
- // FIXME: Look for on separate components
if (Src.getOpcode() == ISD::FNEG) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = Src.getOperand(0);
@@ -1714,19 +1786,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (Src.getOpcode() == ISD::BUILD_VECTOR) {
unsigned VecMods = Mods;
- SDValue Lo = Src.getOperand(0);
- SDValue Hi = Src.getOperand(1);
+ SDValue Lo = stripBitcast(Src.getOperand(0));
+ SDValue Hi = stripBitcast(Src.getOperand(1));
if (Lo.getOpcode() == ISD::FNEG) {
- Lo = Lo.getOperand(0);
+ Lo = stripBitcast(Lo.getOperand(0));
Mods ^= SISrcMods::NEG;
}
if (Hi.getOpcode() == ISD::FNEG) {
- Hi = Hi.getOperand(0);
+ Hi = stripBitcast(Hi.getOperand(0));
Mods ^= SISrcMods::NEG_HI;
}
+ if (isExtractHiElt(Lo, Lo))
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if (isExtractHiElt(Hi, Hi))
+ Mods |= SISrcMods::OP_SEL_1;
+
+ Lo = stripExtractLoElt(Lo);
+ Hi = stripExtractLoElt(Hi);
+
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
@@ -1740,9 +1821,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}
// Packed instructions do not have abs modifiers.
-
- // FIXME: Handle abs/neg of individual components.
- // FIXME: Handle swizzling with op_sel
Mods |= SISrcMods::OP_SEL_1;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
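
foldFrameIndex now answers two questions at once: what the vaddr operand should be, and which SGPR the access is offset against. It returns both as a std::pair that the call sites unpack with std::tie. Below is a standalone sketch of that pair-return shape in plain C++ with made-up names, not the LLVM API.

    // Standalone sketch (hypothetical names, not LLVM API) of returning the
    // folded address together with the offset register it is relative to.
    #include <cstdio>
    #include <string>
    #include <tuple>
    #include <utility>

    enum class OffsetReg { FrameOffset, ScratchWaveOffset };

    // A known stack object is frame-pointer relative; anything else falls
    // back to the entry point's scratch wave offset register.
    static std::pair<std::string, OffsetReg> foldAddress(bool IsFrameIndex,
                                                         const std::string &Addr) {
      if (IsFrameIndex)
        return {"tfi(" + Addr + ")", OffsetReg::FrameOffset};
      return {Addr, OffsetReg::ScratchWaveOffset};
    }

    int main() {
      std::string VAddr;
      OffsetReg SOffset;
      std::tie(VAddr, SOffset) = foldAddress(true, "fi0");
      std::printf("%s soffset=%d\n", VAddr.c_str(), static_cast<int>(SOffset));
      return 0;
    }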
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f80652b87373..5ec46a8294c0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
}
}
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements, would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v2f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_64RegClass, 31);
+ }
+ case MVT::v4i32:
+ case MVT::v4f32:
+ case MVT::v2i64:
+ case MVT::v2f64: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_128RegClass, 29);
+ }
+ case MVT::v8i32:
+ case MVT::v8f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_256RegClass, 25);
+
+ }
+ case MVT::v16i32:
+ case MVT::v16f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_512RegClass, 17);
+
+ }
+ default:
+ return false;
+ }
+}
+
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
//===---------------------------------------------------------------------===//
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
- bool IsVarArg) const {
- return CC_AMDGPU;
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return CC_AMDGPU;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return RetCC_SI_Shader;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return RetCC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
}
/// The SelectionDAGBuilder will automatically promote function arguments
@@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
- State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+ SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // FIXME: Fails for r600 tests
+ //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+ // "wave terminate should not have return values");
return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
@@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) {
- switch (CC) {
- case CallingConv::C:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return CC_AMDGPU_Kernel;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- return CC_AMDGPU;
- default:
- report_fatal_error("Unsupported calling convention.");
- }
+ return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -2532,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
return SDValue();
- // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
-
- // On some subtargets, 64-bit shift is a quarter rate instruction. In the
- // common case, splitting this into a move and a 32-bit shift is faster and
- // the same code size.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!RHS)
return SDValue();
- unsigned RHSVal = RHS->getZExtValue();
- if (RHSVal < 32)
- return SDValue();
-
SDValue LHS = N->getOperand(0);
+ unsigned RHSVal = RHS->getZExtValue();
+ if (!RHSVal)
+ return LHS;
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
+ switch (LHS->getOpcode()) {
+ default:
+ break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ // shl (ext x) => zext (shl x), if the shift does not overflow the source type
+ KnownBits Known;
+ SDValue X = LHS->getOperand(0);
+ DAG.computeKnownBits(X, Known);
+ unsigned LZ = Known.countMinLeadingZeros();
+ if (LZ < RHSVal)
+ break;
+ EVT XVT = X.getValueType();
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+ return DAG.getZExtOrTrunc(Shl, SL, VT);
+ }
+ }
+
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ if (RHSVal < 32)
+ return SDValue();
+
SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
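
The new shl combine rests on a simple arithmetic fact: if x has at least C known leading zero bits in its narrow type, shifting first and zero-extending afterwards gives the same value as extending first and shifting in the wide type, so the slow 64-bit shift can be replaced by a 32-bit one. A small self-contained check of that identity follows; the values are arbitrary.

    // Checks the identity behind the combine: with at least C leading zeros
    // in the 32-bit value, (zext x) << C == zext (x << C). Values arbitrary.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0x0000ABCDu; // 16 leading zero bits in i32
      unsigned C = 12;          // shift amount, no larger than the leading zeros
      uint64_t WideShift   = static_cast<uint64_t>(X) << C; // shl (zext x), C
      uint64_t NarrowShift = static_cast<uint64_t>(X << C); // zext (shl x, C)
      assert(WideShift == NarrowShift);
      return 0;
    }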
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4c588a7bafd0..fb2f15022d25 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -115,9 +115,6 @@ protected:
SmallVectorImpl<SDValue> &Results) const;
void analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
- void AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@@ -164,6 +161,8 @@ public:
bool isCheapToSpeculateCtlz() const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 353cc5742791..e286558ce60d 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -380,6 +380,6 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index dcb6670621ee..846e7dff5f8c 100644
--- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -25,15 +26,13 @@ const unsigned MaxStaticSize = 1024;
class AMDGPULowerIntrinsics : public ModulePass {
private:
- const TargetMachine *TM;
-
bool makeLIDRangeMetadata(Function &F) const;
public:
static char ID;
- AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
- : ModulePass(ID), TM(TM) { }
+ AMDGPULowerIntrinsics() : ModulePass(ID) {}
+
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
@@ -46,8 +45,8 @@ char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
- "Lower intrinsics", false, false)
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
+ false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
@@ -104,11 +103,13 @@ static bool expandMemIntrinsicUses(Function &F) {
}
bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
- if (!TM)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
return false;
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
bool Changed = false;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
for (auto *U : F.users()) {
auto *CI = dyn_cast<CallInst>(U);
@@ -155,6 +156,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
- return new AMDGPULowerIntrinsics(TM);
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+ return new AMDGPULowerIntrinsics();
}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index da247fea7de6..f1ef6281c90f 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -126,9 +126,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
- int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+ // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+ // need to select it to the subtarget specific version, and there's no way to
+ // do that with a single pseudo source operation.
+ if (Opcode == AMDGPU::S_SETPC_B64_return)
+ Opcode = AMDGPU::S_SETPC_B64;
+ int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index fe7283ccf7d9..9fb7f5f88927 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,21 +12,6 @@
using namespace llvm;
-static bool isEntryFunctionCC(CallingConv::ID CC) {
- switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- return true;
- default:
- return false;
- }
-}
-
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
@@ -34,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+ IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e40f39557747..85184b363905 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -99,8 +100,7 @@ private:
public:
static char ID;
- AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
- FunctionPass(ID), TM(TM_) {}
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -119,30 +119,31 @@ public:
char AMDGPUPromoteAlloca::ID = 0;
-INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
- "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
- if (!TM)
- return false;
-
Mod = &M;
DL = &Mod->getDataLayout();
- const Triple &TT = TM->getTargetTriple();
-
- IsAMDGCN = TT.getArch() == Triple::amdgcn;
- IsAMDHSA = TT.getOS() == Triple::AMDHSA;
-
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const Triple &TT = TM->getTargetTriple();
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
@@ -874,6 +875,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
}
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
- return new AMDGPUPromoteAlloca(TM);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 941f2d8a468a..b2867fcc49f9 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -14,6 +14,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIRegisterInfo.h"
using namespace llvm;
@@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-
-const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
- const MachineFunction *) const {
- return &CalleeSavedReg;
-}
-
-unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
-}
-
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@@ -50,3 +39,35 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
+
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction()->getCallingConv();
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_SaveList;
+ default: {
+ // Dummy to not crash RegisterClassInfo.
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+ return &NoCalleeSavedReg;
+ }
+ }
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_RegMask;
+ default:
+ return nullptr;
+ }
+}
+
+unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 22b1663821d9..d8604d2590f1 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -30,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
-
- const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 386a88b0520f..a9d3a31a7240 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -570,7 +570,7 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
- addPass(createAMDGPULowerIntrinsicsPass(&TM));
+ addPass(createAMDGPULowerIntrinsicsPass());
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
@@ -585,8 +585,7 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass(
- static_cast<const GCNTargetMachine *>(&TM)));
+ addPass(createAMDGPUCodeGenPreparePass());
}
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
@@ -594,7 +593,7 @@ void AMDGPUPassConfig::addIRPasses() {
if (TM.getOptLevel() > CodeGenOpt::None) {
addPass(createInferAddressSpacesPass());
- addPass(createAMDGPUPromoteAlloca(&TM));
+ addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
@@ -664,22 +663,22 @@ bool R600PassConfig::addPreISel() {
}
void R600PassConfig::addPreRegAlloc() {
- addPass(createR600VectorRegMerger(*TM));
+ addPass(createR600VectorRegMerger());
}
void R600PassConfig::addPreSched2() {
addPass(createR600EmitClauseMarkers(), false);
if (EnableR600IfConvert)
addPass(&IfConverterID, false);
- addPass(createR600ClauseMergePass(*TM), false);
+ addPass(createR600ClauseMergePass(), false);
}
void R600PassConfig::addPreEmitPass() {
addPass(createAMDGPUCFGStructurizerPass(), false);
- addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(createR600ExpandSpecialInstrsPass(), false);
addPass(&FinalizeMachineBundlesID, false);
- addPass(createR600Packetizer(*TM), false);
- addPass(createR600ControlFlowFinalizer(*TM), false);
+ addPass(createR600Packetizer(), false);
+ addPass(createR600ControlFlowFinalizer(), false);
}
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
@@ -703,8 +702,7 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
- addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+ addPass(createAMDGPUAnnotateKernelFeaturesPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 70c848f3c7bd..b52ea2b3a2c6 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -2796,6 +2796,7 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
+ unsigned OperandIdx[4];
unsigned EnMask = 0;
int SrcIdx = 0;
@@ -2804,15 +2805,18 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
// Add the register arguments
if (Op.isReg()) {
- EnMask |= (1 << SrcIdx);
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Op.addRegOperands(Inst, 1);
++SrcIdx;
continue;
}
if (Op.isOff()) {
- ++SrcIdx;
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+ ++SrcIdx;
continue;
}
@@ -2828,6 +2832,22 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
+ assert(SrcIdx == 4);
+
+ bool Compr = false;
+ if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) {
+ Compr = true;
+ Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]);
+ Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister);
+ Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister);
+ }
+
+ for (auto i = 0; i < SrcIdx; ++i) {
+ if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) {
+ EnMask |= Compr? (0x3 << i * 2) : (0x1 << i);
+ }
+ }
+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
@@ -3642,6 +3662,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
{"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+ {"compr", AMDGPUOperand::ImmTyExpCompr, true, nullptr },
{"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
{"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
{"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 89eddb9ce961..2aca65ac8430 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">;
-def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4fb03b62bba9..137b5cca96ce 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -126,6 +126,7 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
assert(MI.getOpcode() == 0);
assert(MI.getNumOperands() == 0);
MCInst TmpInst;
+ HasLiteral = false;
const auto SavedBytes = Bytes;
if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
MI = TmpInst;
@@ -343,10 +344,15 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
- if (Bytes.size() < 4)
- return errOperand(0, "cannot read literal, inst bytes left " +
- Twine(Bytes.size()));
- return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+ if (!HasLiteral) {
+ if (Bytes.size() < 4) {
+ return errOperand(0, "cannot read literal, inst bytes left " +
+ Twine(Bytes.size()));
+ }
+ HasLiteral = true;
+ Literal = eatBytes<uint32_t>(Bytes);
+ }
+ return MCOperand::createImm(Literal);
}
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
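
The disassembler change caches the 32-bit literal: HasLiteral is cleared per instruction, the first operand that needs the literal consumes four bytes and stores it, and later operands of the same instruction reuse the cached value instead of eating more of the byte stream. Below is a standalone sketch of that caching with invented names, not the MCDisassembler API.

    // Standalone sketch (invented names) of caching a once-read literal so a
    // second request within the same instruction consumes no further bytes.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct LiteralDecoder {
      const uint8_t *Bytes = nullptr;
      std::size_t Size = 0;
      bool HasLiteral = false; // reset at the start of each instruction
      uint32_t Literal = 0;

      bool decodeLiteral(uint32_t &Out) {
        if (!HasLiteral) {
          if (Size < 4)
            return false;       // not enough instruction bytes left
          std::memcpy(&Literal, Bytes, 4);
          Bytes += 4;
          Size -= 4;
          HasLiteral = true;
        }
        Out = Literal;
        return true;
      }
    };

    int main() {
      std::vector<uint8_t> Stream = {0x78, 0x56, 0x34, 0x12};
      LiteralDecoder D;
      D.Bytes = Stream.data();
      D.Size = Stream.size();

      uint32_t A = 0, B = 0;
      D.decodeLiteral(A);
      D.decodeLiteral(B); // second use hits the cache, leaving the stream alone
      std::printf("0x%08x 0x%08x bytes-left=%zu\n", A, B, D.Size);
      return 0;
    }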
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index d50665187e10..620bae0a6d1a 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -39,6 +39,8 @@ class Twine;
class AMDGPUDisassembler : public MCDisassembler {
private:
mutable ArrayRef<uint8_t> Bytes;
+ mutable uint32_t Literal;
+ mutable bool HasLiteral;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 8066428fe44a..18374dca3f84 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
+#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -63,15 +64,6 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
return true;
}
-static GCNRPTracker::LiveRegSet
-stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
- GCNRPTracker::LiveRegSet Res;
- for (const auto &P : LR) {
- if (P.second.any())
- Res.insert(P);
- }
- return Res;
-}
#endif
///////////////////////////////////////////////////////////////////////////////
@@ -185,6 +177,64 @@ void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
}
#endif
+
+static LaneBitmask getDefRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ assert(MO.isDef() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ // We don't rely on read-undef flag because in case of tentative schedule
+ // tracking it isn't set correctly yet. This works correctly however since
+ // use mask has been tracked before using LIS.
+ return MO.getSubReg() == 0 ?
+ MRI.getMaxLaneMaskForVReg(MO.getReg()) :
+ MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+static LaneBitmask getUsedRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals &LIS) {
+ assert(MO.isUse() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ if (auto SubReg = MO.getSubReg())
+ return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+ auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
+ if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ return MaxMask;
+
+ // For a tentative schedule LIS isn't updated yet but livemask should remain
+ // the same on any schedule. Subreg defs can be reordered but they all must
+ // dominate uses anyway.
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
+}
+
+SmallVector<RegisterMaskPair, 8> collectVirtualRegUses(const MachineInstr &MI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ SmallVector<RegisterMaskPair, 8> Res;
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ if (!MO.isUse() || !MO.readsReg())
+ continue;
+
+ auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
+
+ auto Reg = MO.getReg();
+ auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) {
+ return RM.RegUnit == Reg;
+ });
+ if (I != Res.end())
+ I->LaneMask |= UsedMask;
+ else
+ Res.push_back(RegisterMaskPair(Reg, UsedMask));
+ }
+ return Res;
+}
+
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
@@ -222,36 +272,6 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
- assert(MO.isDef() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
- // We don't rely on read-undef flag because in case of tentative schedule
- // tracking it isn't set correctly yet. This works correctly however since
- // use mask has been tracked before using LIS.
- return MO.getSubReg() == 0 ?
- MRI->getMaxLaneMaskForVReg(MO.getReg()) :
- MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
-}
-
-LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
- assert(MO.isUse() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
-
- if (auto SubReg = MO.getSubReg())
- return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
-
- auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
- if (MaxMask.getAsInteger() == 1) // cannot have subregs
- return MaxMask;
-
- // For a tentative schedule LIS isn't updated yet but livemask should remain
- // the same on any schedule. Subreg defs can be reordered but they all must
- // dominate uses anyway.
- auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
- return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
-}
-
void GCNUpwardRPTracker::reset(const MachineInstr &MI,
const LiveRegSet *LiveRegsCopy) {
MRI = &MI.getParent()->getParent()->getRegInfo();
@@ -272,34 +292,40 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
if (MI.isDebugValue())
return;
- // process all defs first to ensure early clobbers are handled correctly
- // iterating over operands() to catch implicit defs
- for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
+ auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
- auto Reg = MO.getReg();
- auto &LiveMask = LiveRegs[Reg];
- auto PrevMask = LiveMask;
- LiveMask &= ~getDefRegMask(MO);
- CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ // calc pressure at the MI (defs + uses)
+ auto AtMIPressure = CurPressure;
+ for (const auto &U : RegUses) {
+ auto LiveMask = LiveRegs[U.RegUnit];
+ AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI);
}
+ // update max pressure
+ MaxPressure = max(AtMIPressure, MaxPressure);
- // then all uses
- for (const auto &MO : MI.uses()) {
- if (!MO.isReg() || !MO.readsReg() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ for (const auto &MO : MI.defs()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
+ MO.isDead())
continue;
auto Reg = MO.getReg();
- auto &LiveMask = LiveRegs[Reg];
+ auto I = LiveRegs.find(Reg);
+ if (I == LiveRegs.end())
+ continue;
+ auto &LiveMask = I->second;
auto PrevMask = LiveMask;
- LiveMask |= getUsedRegMask(MO);
+ LiveMask &= ~getDefRegMask(MO, *MRI);
CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ if (LiveMask.none())
+ LiveRegs.erase(I);
}
-
- MaxPressure = max(MaxPressure, CurPressure);
+ for (const auto &U : RegUses) {
+ auto &LiveMask = LiveRegs[U.RegUnit];
+ auto PrevMask = LiveMask;
+ LiveMask |= U.LaneMask;
+ CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+ }
+ assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
@@ -368,7 +394,7 @@ void GCNDownwardRPTracker::advanceToNext() {
continue;
auto &LiveMask = LiveRegs[Reg];
auto PrevMask = LiveMask;
- LiveMask |= getDefRegMask(MO);
+ LiveMask |= getDefRegMask(MO, *MRI);
CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
}
@@ -430,7 +456,7 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
bool GCNUpwardRPTracker::isValid() const {
const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
- const auto TrackedLR = stripEmpty(LiveRegs);
+ const auto &TrackedLR = LiveRegs;
if (!isEqual(LISLR, TrackedLR)) {
dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 9875ca6a6d16..5dfe44053e72 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -98,8 +98,6 @@ protected:
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
- LaneBitmask getDefRegMask(const MachineOperand &MO) const;
- LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index d0aba38f786d..fbe45cb222d9 100644
--- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -62,7 +62,7 @@ private:
const MachineInstr &LatrCFAlu) const;
public:
- R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+ R600ClauseMergePass() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -208,6 +208,6 @@ StringRef R600ClauseMergePass::getPassName() const {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
- return new R600ClauseMergePass(TM);
+llvm::FunctionPass *llvm::createR600ClauseMergePass() {
+ return new R600ClauseMergePass();
}
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 811b905588b4..09b328765604 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -499,7 +499,7 @@ private:
}
public:
- R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {}
+ R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
ST = &MF.getSubtarget<R600Subtarget>();
@@ -706,6 +706,6 @@ char R600ControlFlowFinalizer::ID = 0;
} // end anonymous namespace
-FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
- return new R600ControlFlowFinalizer(TM);
+FunctionPass *llvm::createR600ControlFlowFinalizer() {
+ return new R600ControlFlowFinalizer();
}
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 3e46e6387614..5c30a0734f0d 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -37,7 +37,7 @@ private:
unsigned Op);
public:
- R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID),
TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -51,8 +51,8 @@ public:
char R600ExpandSpecialInstrsPass::ID = 0;
-FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
- return new R600ExpandSpecialInstrsPass(TM);
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass() {
+ return new R600ExpandSpecialInstrsPass();
}
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index d90008a550ae..502dd3bce97e 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -124,7 +124,7 @@ private:
public:
static char ID;
- R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600VectorRegMerger() : MachineFunctionPass(ID),
TII(nullptr) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -396,6 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
return false;
}
-llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
- return new R600VectorRegMerger(tm);
+llvm::FunctionPass *llvm::createR600VectorRegMerger() {
+ return new R600VectorRegMerger();
}
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index 5b6dd1ed128d..3e957126b497 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -36,7 +36,7 @@ class R600Packetizer : public MachineFunctionPass {
public:
static char ID;
- R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+ R600Packetizer() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -404,6 +404,6 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
- return new R600Packetizer(tm);
+llvm::FunctionPass *llvm::createR600Packetizer() {
+ return new R600Packetizer();
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index dfdc602b80cd..7501facb0cba 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *) const {
+ return &CalleeSavedReg;
+}
+
+unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
+
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h
index 9dfb3106c6cc..f0d9644b02f2 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
R600RegisterInfo();
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 1279f845de0e..97bb0f0c0656 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -189,8 +189,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// ----
// 13 (+1)
unsigned ReservedRegCount = 13;
- if (SPReg != AMDGPU::NoRegister)
- ++ReservedRegCount;
if (AllSGPRs.size() < ReservedRegCount)
return std::make_pair(ScratchWaveOffsetReg, SPReg);
@@ -208,13 +206,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
MFI->setScratchWaveOffsetReg(Reg);
ScratchWaveOffsetReg = Reg;
- } else {
- if (SPReg == AMDGPU::NoRegister)
- break;
-
- MRI.replaceRegWith(SPReg, Reg);
- MFI->setStackPtrOffsetReg(Reg);
- SPReg = Reg;
break;
}
}
@@ -223,8 +214,8 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
-void SIFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -424,6 +415,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI->isEntryFunction())
+ emitEntryFunctionPrologue(MF, MBB);
+}
+
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 7ccd02b3c86a..e17adbe27361 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -26,6 +26,8 @@ public:
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
+ void emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 286be355bc14..01c1f78e7ca4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -914,6 +914,55 @@ SDValue SITargetLowering::lowerKernargMemParameter(
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
+SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (Arg.Flags.isByVal()) {
+ unsigned Size = Arg.Flags.getByValSize();
+ int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
+ return DAG.getFrameIndex(FrameIdx, MVT::i32);
+ }
+
+ unsigned ArgOffset = VA.getLocMemOffset();
+ unsigned ArgSize = VA.getValVT().getStoreSize();
+
+ int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
+
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ SDValue ArgValue;
+
+ // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ MVT MemVT = VA.getValVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ break;
+ case CCValAssign::BCvt:
+ MemVT = VA.getLocVT();
+ break;
+ case CCValAssign::SExt:
+ ExtType = ISD::SEXTLOAD;
+ break;
+ case CCValAssign::ZExt:
+ ExtType = ISD::ZEXTLOAD;
+ break;
+ case CCValAssign::AExt:
+ ExtType = ISD::EXTLOAD;
+ break;
+ }
+
+ ArgValue = DAG.getExtLoad(
+ ExtType, SL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MemVT);
+ return ArgValue;
+}
+
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@@ -1094,10 +1143,12 @@ static void allocateSystemSGPRs(CCState &CCInfo,
static void reservePrivateMemoryRegs(const TargetMachine &TM,
MachineFunction &MF,
const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+ SIMachineFunctionInfo &Info,
+ bool NeedSP) {
// Now that we've figured out where the scratch register inputs are, see if
// should reserve the arguments and use them directly.
- bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool HasStackObjects = MFI.hasStackObjects();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
@@ -1155,6 +1206,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
+
+ if (NeedSP) {
+ unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
+ Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+ assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
+ assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
+ Info.getStackPtrOffsetReg()));
+ }
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -1223,8 +1283,10 @@ SDValue SITargetLowering::LowerFormalArguments(
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
!Info->hasWorkItemIDZ());
+ } else if (IsKernel) {
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
+ Splits.append(Ins.begin(), Ins.end());
}
if (IsEntryFunc) {
@@ -1278,11 +1340,14 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Arg);
continue;
+ } else if (!IsEntryFunc && VA.isMemLoc()) {
+ SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
+ InVals.push_back(Val);
+ if (!Arg.Flags.isByVal())
+ Chains.push_back(Val.getValue(1));
+ continue;
}
- if (VA.isMemLoc())
- report_fatal_error("memloc not supported with calling convention");
-
assert(VA.isRegLoc() && "Parameter must be in a register!");
unsigned Reg = VA.getLocReg();
@@ -1291,7 +1356,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.VT.isVector()) {
+ if (IsShader && Arg.VT.isVector()) {
// Build a vector from the registers
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
@@ -1317,16 +1382,49 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+ // TODO: Could maybe omit SP if only tail calls?
+ bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
+
// Start adding system SGPRs.
- if (IsEntryFunc)
+ if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
-
- reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
+ } else {
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+ CCInfo.AllocateReg(Info->getFrameOffsetReg());
+
+ if (NeedSP) {
+ unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
+ CCInfo.AllocateReg(StackPtrReg);
+ Info->setStackPtrOffsetReg(StackPtrReg);
+ }
+ }
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing the rest on the stack.
+bool SITargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv,
+ MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ // Replacing returns with sret/stack usage doesn't make sense for shaders.
+ // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+ // for shaders. Vector types should be explicitly handled by CC.
+ if (AMDGPU::isEntryFunctionCC(CallConv))
+ return true;
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -1336,11 +1434,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (!AMDGPU::isShader(CallConv))
+ if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
+ }
+
+ bool IsShader = AMDGPU::isShader(CallConv);
Info->setIfReturnsVoid(Outs.size() == 0);
+ bool IsWaveEnd = Info->returnsVoid() && IsShader;
SmallVector<ISD::OutputArg, 48> Splits;
SmallVector<SDValue, 48> SplitVals;
@@ -1349,7 +1451,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
const ISD::OutputArg &Out = Outs[i];
- if (Out.VT.isVector()) {
+ if (IsShader && Out.VT.isVector()) {
MVT VT = Out.VT.getVectorElementType();
ISD::OutputArg NewOut = Out;
NewOut.Flags.setSplit();
@@ -1380,29 +1482,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
*DAG.getContext());
// Analyze outgoing return values.
- AnalyzeReturn(CCInfo, Splits);
+ CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Add return address for callable functions.
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ SDValue ReturnAddrReg = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+ // FIXME: Should be able to use a vreg here, but need a way to prevent it
+ // from being allocated to a CSR.
+
+ SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+ MVT::i64);
+
+ Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+ Flag = Chain.getValue(1);
+
+ RetOps.push_back(PhysReturnAddrReg);
+ }
+
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
+ // TODO: Partially return in registers if return values don't fit.
SDValue Arg = SplitVals[realRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1410,12 +1541,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ // FIXME: Does sret work properly?
+
// Update chain and glue.
RetOps[0] = Chain;
if (Flag.getNode())
RetOps.push_back(Flag);
- unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
+ unsigned Opc = AMDGPUISD::ENDPGM;
+ if (!IsWaveEnd)
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -2660,6 +2795,15 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+
+ // Make sure we do any optimizations that will make it easier to fold
+ // source modifiers before obscuring it with bit operations.
+
+ // XXX - Why doesn't this get called when vector_shuffle is expanded?
+ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
+ return Combined;
+
if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
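
A minimal standalone sketch (not part of the patch) of the return-opcode selection LowerReturn now performs; the enum and function name are illustrative stand-ins, not LLVM API, and the booleans mirror IsShader/IsWaveEnd above.

// Sketch only: simplified model of the opcode choice at the end of LowerReturn.
enum class RetOpc { EndPgm, ReturnToEpilog, RetFlag };

RetOpc selectReturnOpcode(bool IsShader, bool ReturnsVoid) {
  bool IsWaveEnd = ReturnsVoid && IsShader;   // a shader returning void ends the wave
  if (IsWaveEnd)
    return RetOpc::EndPgm;                    // AMDGPUISD::ENDPGM
  return IsShader ? RetOpc::ReturnToEpilog    // AMDGPUISD::RETURN_TO_EPILOG
                  : RetOpc::RetFlag;          // AMDGPUISD::RET_FLAG
}
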
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 046e677756d1..e68837747491 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -28,6 +28,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
uint64_t Offset, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
+ SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const;
+
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -177,7 +181,12 @@ public:
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index b83a1fe187eb..02c9b4b1f0ee 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -228,10 +228,10 @@ class EXPe : Enc64 {
bits<1> compr;
bits<1> done;
bits<1> vm;
- bits<8> vsrc0;
- bits<8> vsrc1;
- bits<8> vsrc2;
- bits<8> vsrc3;
+ bits<8> src0;
+ bits<8> src1;
+ bits<8> src2;
+ bits<8> src3;
let Inst{3-0} = en;
let Inst{9-4} = tgt;
@@ -239,10 +239,10 @@ class EXPe : Enc64 {
let Inst{11} = done;
let Inst{12} = vm;
let Inst{31-26} = 0x3e;
- let Inst{39-32} = vsrc0;
- let Inst{47-40} = vsrc1;
- let Inst{55-48} = vsrc2;
- let Inst{63-56} = vsrc3;
+ let Inst{39-32} = src0;
+ let Inst{47-40} = src1;
+ let Inst{55-48} = src2;
+ let Inst{63-56} = src3;
}
let Uses = [EXEC] in {
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 933a16646746..c6ad61a325cc 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -97,9 +97,7 @@ private:
public:
static char ID;
- SILoadStoreOptimizer() : MachineFunctionPass(ID) {}
-
- SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
+ SILoadStoreOptimizer() : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
@@ -129,8 +127,8 @@ char SILoadStoreOptimizer::ID = 0;
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
-FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
- return new SILoadStoreOptimizer(TM);
+FunctionPass *llvm::createSILoadStoreOptimizerPass() {
+ return new SILoadStoreOptimizer();
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index adebb8c4a1c5..18b197ddb7ae 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -80,17 +80,22 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
WavesPerEU = ST.getWavesPerEU(*F);
- // Non-entry functions have no special inputs for now.
- // TODO: Return early for non-entry CCs.
+ if (!isEntryFunction()) {
+ // Non-entry functions have no special inputs for now, other than the
+ // registers required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+ ScratchWaveOffsetReg = AMDGPU::SGPR4;
+ FrameOffsetReg = AMDGPU::SGPR5;
+ return;
+ }
CallingConv::ID CC = F->getCallingConv();
- if (CC == CallingConv::AMDGPU_PS)
- PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
-
- if (AMDGPU::isKernel(CC)) {
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
}
if (ST.debuggerEmitPrologue()) {
@@ -120,7 +125,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
- bool HasStackObjects = FrameInfo.hasStackObjects();
+ bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc9f509e60ae..348bb4fa0260 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,9 +388,8 @@ public:
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
-
- // FIXME: Only for entry functions.
- FrameOffsetReg = ScratchWaveOffsetReg;
+ if (isEntryFunction())
+ FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index e02c2e3240e8..4dc090d9b7ed 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>
+#include <unordered_set>
using namespace llvm;
@@ -44,26 +45,29 @@ namespace {
class SDWAOperand;
class SIPeepholeSDWA : public MachineFunctionPass {
+public:
+ typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+ std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
Optional<int64_t> foldToImm(const MachineOperand &Op) const;
public:
static char ID;
- typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
-
SIPeepholeSDWA() : MachineFunctionPass(ID) {
initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
+ bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -468,7 +472,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
auto SDWADst =
- make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;
@@ -575,8 +579,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}
-bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
- const SDWAOperandsVector &SDWAOperands) {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
// Check if this instruction can be converted to SDWA:
// 1. Does this opcode support SDWA
if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
@@ -588,6 +591,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}
+ return true;
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+ const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
assert(SDWAOpcode != -1);
@@ -664,7 +672,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
  // Apply all sdwa operand patterns
bool Converted = false;
for (auto &Operand : SDWAOperands) {
- Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+ // There should be no intersection between SDWA operands and potential MIs
+ // e.g.:
+ // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+ // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+ // v_add_u32 v3, v4, v2
+ //
+ // In that example it is possible that we would fold 2nd instruction into 3rd
+ // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
+ // already destroyed). So if SDWAOperand is also a potential MI then do not
+ // apply it.
+ if (PotentialMatches.count(Operand->getParentInst()) == 0)
+ Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
if (!Converted) {
SDWAInst->eraseFromParent();
@@ -690,16 +709,15 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
-
- std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
-
+
+ // Find all SDWA operands in MF.
matchSDWAOperands(MF);
- for (auto &OperandPair : SDWAOperands) {
- auto &Operand = OperandPair.second;
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI) {
- PotentialMatches[PotentialMI].push_back(std::move(Operand));
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
@@ -708,6 +726,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
convertToSDWA(PotentialMI, PotentialPair.second);
}
+ PotentialMatches.clear();
SDWAOperands.clear();
return false;
}
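
A minimal standalone sketch (not part of the patch) of the guard the comment in convertToSDWA describes: an operand is only folded when the instruction it was matched on is not itself queued as a conversion candidate. The types are simplified stand-ins, not the LLVM classes.

#include <unordered_map>
#include <vector>

struct Instr;
struct Operand { Instr *Parent; };   // instruction this operand was matched on

using PendingMap = std::unordered_map<Instr *, std::vector<Operand *>>;

// Folding Op is safe only if its parent is not a pending candidate that the
// fold could destroy before the candidate itself gets converted.
bool safeToFold(const Operand &Op, const PendingMap &PotentialMatches) {
  return PotentialMatches.count(Op.Parent) == 0;
}
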
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 06cfc95be96a..6fb01a09fe13 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -117,11 +117,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
- const MachineFunction &MF) const {
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- unsigned RegCount = ST.getMaxNumSGPRs(MF);
+static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned Reg;
// Try to place it in a hole after PrivateSegmentBufferReg.
@@ -134,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
// wave offset before it.
Reg = RegCount - 5;
}
+
+ return Reg;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
+unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
+ const MachineFunction &MF) const {
+ return AMDGPU::SGPR32;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -198,15 +207,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+ if (StackPtrReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, StackPtrReg);
+ assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
+ }
+
+ unsigned FrameReg = MFI->getFrameOffsetReg();
+ if (FrameReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, FrameReg);
+ assert(!isSubRegister(ScratchRSrcReg, FrameReg));
+ }
+
return Reserved;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo().hasStackObjects();
+ const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
+ if (Info->isEntryFunction()) {
+ const MachineFrameInfo &MFI = Fn.getFrameInfo();
+ return MFI.hasStackObjects() || MFI.hasCalls();
+ }
+
+ // May need scavenger for dealing with callee saved registers.
+ return true;
}
-bool
-SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
return MF.getFrameInfo().hasStackObjects();
}
@@ -318,8 +345,11 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-
assert(TII->isMUBUF(MI));
+ assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
+ MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
+ "should only be seeing frame offset relative FrameIndex");
+
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -981,12 +1011,72 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
default: {
- if (TII->isMUBUF(*MI)) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsMUBUF = TII->isMUBUF(*MI);
+
+ if (!IsMUBUF &&
+ MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+ // Convert to an absolute stack address by finding the offset from the
+ // scratch wave base and scaling by the wave size.
+ //
+ // In an entry function/kernel the stack address is already the absolute
+ // address relative to the scratch wave offset.
+
+ unsigned DiffReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
+ unsigned ResultReg = IsCopy ?
+ MI->getOperand(0).getReg() :
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
+ .addReg(MFI->getFrameOffsetReg())
+ .addReg(MFI->getScratchWaveOffsetReg());
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (Offset == 0) {
+ // XXX - This never happens because of emergency scavenging slot at 0?
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg);
+ } else {
+ unsigned CarryOut
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned ScaledReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // XXX - Should this use a vector shift?
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(Log2_32(ST.getWavefrontSize()));
+
+ // TODO: Fold if use instruction is another add of a constant.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addImm(Offset)
+ .addReg(ScaledReg, RegState::Kill);
+
+ MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
+ }
+
+ // Don't introduce an extra copy if we're just materializing in a mov.
+ if (IsCopy)
+ MI->eraseFromParent();
+ else
+ FIOp.ChangeToRegister(ResultReg, false, false, true);
+ return;
+ }
+
+ if (IsMUBUF) {
// Disable offen so we don't need a 0 vgpr base.
assert(static_cast<int>(FIOperandNum) ==
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
+ == MFI->getFrameOffsetReg());
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
= TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
@@ -995,17 +1085,19 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (isUInt<12>(NewOffset) &&
buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
- break;
+ return;
}
}
+ // If the offset is simply too big, don't convert to a scratch wave offset
+ // relative index.
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
}
}
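
A minimal standalone sketch (not part of the patch) of the address computation emitted on the non-MUBUF path in eliminateFrameIndex above: the difference between the frame offset and the scratch wave offset is scaled down by the wavefront size, then the frame object offset is added. Plain integers stand in for the registers, and the helper name is illustrative.

#include <cassert>
#include <cstdint>

uint32_t lowerFrameIndexAddress(uint32_t FrameOffset, uint32_t ScratchWaveOffset,
                                unsigned WavefrontSize, int64_t ObjectOffset) {
  assert(WavefrontSize != 0 && (WavefrontSize & (WavefrontSize - 1)) == 0 &&
         "wavefront size is assumed to be a power of two");
  unsigned Log2Wave = 0;
  while ((1u << Log2Wave) < WavefrontSize)
    ++Log2Wave;
  uint32_t Diff = FrameOffset - ScratchWaveOffset;     // S_SUB_U32
  uint32_t Scaled = Diff >> Log2Wave;                  // S_LSHR_B32 by log2(wave size)
  return Scaled + static_cast<uint32_t>(ObjectOffset); // V_ADD_I32_e64 with the object offset
}
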
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 679ed229758a..b91cdddc5520 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -17,6 +17,7 @@
#include "AMDGPURegisterInfo.h"
#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@@ -57,8 +58,16 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
+ unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -228,6 +237,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
+ unsigned getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+ }
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 593439c2a3cd..f2d8b6f7b7a4 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -186,11 +186,23 @@ def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
-let isTerminator = 1, isBarrier = 1,
- isBranch = 1, isIndirectBranch = 1 in {
+let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
+
+let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+} // End isBranch = 1, isIndirectBranch = 1
+
+let isReturn = 1 in {
+// Define variant marked as return rather than branch.
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+}
+} // End isTerminator = 1, isBarrier = 1
+
+let isCall = 1 in {
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
}
-def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
+
def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index d565c84bfeda..2abd4afad3b6 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -518,7 +518,18 @@ bool isCompute(CallingConv::ID cc) {
}
bool isEntryFunctionCC(CallingConv::ID CC) {
- return true;
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return true;
+ default:
+ return false;
+ }
}
bool isSI(const MCSubtargetInfo &STI) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d6c836eb748b..8e74aa2cc9a8 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -262,7 +262,6 @@ bool isEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
inline bool isKernel(CallingConv::ID CC) {
switch (CC) {
- case CallingConv::C:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return true;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 5583d6148b08..1979cbf50125 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -471,7 +471,7 @@ void ARMPassConfig::addIRPasses() {
if (TM->Options.ThreadModel == ThreadModel::Single)
addPass(createLowerAtomicPass());
else
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -486,7 +486,7 @@ void ARMPassConfig::addIRPasses() {
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
}
bool ARMPassConfig::addPreISel() {
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index e4df7ff5c200..e6ea67d55b43 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -399,7 +399,7 @@ void Simplifier::Context::cleanup() {
for (Value *V : Clones) {
Instruction *U = cast<Instruction>(V);
if (!U->getParent())
- delete U;
+ U->deleteValue();
}
}
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 6913d50bbcaa..8e93df6201ae 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -252,7 +252,7 @@ void HexagonPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
if (!NoOpt) {
if (EnableLoopPrefetch)
addPass(createLoopDataPrefetchPass());
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 7553f3972f5d..008b9505ee26 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -23,14 +23,14 @@ namespace llvm {
class ModulePass;
class FunctionPass;
- ModulePass *createMipsOs16Pass(MipsTargetMachine &TM);
- ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM);
+ ModulePass *createMipsOs16Pass();
+ ModulePass *createMips16HardFloatPass();
- FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM);
- FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
- FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsModuleISelDagPass();
+ FunctionPass *createMipsOptimizePICCallPass();
+ FunctionPass *createMipsDelaySlotFillerPass();
FunctionPass *createMipsHazardSchedule();
- FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsLongBranchPass();
FunctionPass *createMipsConstantIslandPass();
FunctionPass *createMicroMipsSizeReductionPass();
} // end namespace llvm;
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 5a394fe02f16..3c2426129e49 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
@@ -28,14 +29,16 @@ namespace {
public:
static char ID;
- Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {}
+ Mips16HardFloat() : ModulePass(ID) {}
StringRef getPassName() const override { return "MIPS16 Hard Float Pass"; }
- bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ ModulePass::getAnalysisUsage(AU);
+ }
- protected:
- const MipsTargetMachine &TM;
+ bool runOnModule(Module &M) override;
};
static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
@@ -520,6 +523,8 @@ static void removeUseSoftFloat(Function &F) {
// during call lowering but it should be moved here in the future.
//
bool Mips16HardFloat::runOnModule(Module &M) {
+ auto &TM = static_cast<const MipsTargetMachine &>(
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>());
DEBUG(errs() << "Run on Module Mips16HardFloat\n");
bool Modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
@@ -541,6 +546,6 @@ bool Mips16HardFloat::runOnModule(Module &M) {
}
-ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) {
- return new Mips16HardFloat(TM);
+ModulePass *llvm::createMips16HardFloatPass() {
+ return new Mips16HardFloat();
}
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 1597057ad63f..5d82571ff94f 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -211,12 +211,12 @@ namespace {
class Filler : public MachineFunctionPass {
public:
- Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm) { }
+ Filler() : MachineFunctionPass(ID), TM(nullptr) {}
StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
bool runOnMachineFunction(MachineFunction &F) override {
+ TM = &F.getTarget();
bool Changed = false;
for (MachineFunction::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI)
@@ -290,7 +290,7 @@ namespace {
bool terminateSearch(const MachineInstr &Candidate) const;
- TargetMachine &TM;
+ const TargetMachine *TM;
static char ID;
};
@@ -610,7 +610,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
Changed = true;
// Delay slot filling is disabled at -O0.
- if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+ if (!DisableDelaySlotFiller && (TM->getOptLevel() != CodeGenOpt::None)) {
bool Filled = false;
if (MipsCompactBranchPolicy.getValue() != CB_Always ||
@@ -910,6 +910,4 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
- return new Filler(tm);
-}
+FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new Filler(); }
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 78bae6954c3c..3641a70d61b5 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -795,7 +795,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
- ConstantSDNode *CN;
+ ConstantSDNode *CN, *CN1;
// See if Op's first operand matches (and $src1 , mask0).
if (And0.getOpcode() != ISD::AND)
@@ -806,37 +806,74 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// See if Op's second operand matches (and (shl $src, pos), mask1).
- if (And1.getOpcode() != ISD::AND)
- return SDValue();
+ if (And1.getOpcode() == ISD::AND &&
+ And1.getOperand(0).getOpcode() == ISD::SHL) {
- if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
- !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
- return SDValue();
+ if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+ !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+ return SDValue();
- // The shift masks must have the same position and size.
- if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
- return SDValue();
+ // The shift masks must have the same position and size.
+ if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+ return SDValue();
- SDValue Shl = And1.getOperand(0);
- if (Shl.getOpcode() != ISD::SHL)
- return SDValue();
+ SDValue Shl = And1.getOperand(0);
- if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
- return SDValue();
+ if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+ return SDValue();
- unsigned Shamt = CN->getZExtValue();
+ unsigned Shamt = CN->getZExtValue();
- // Return if the shift amount and the first bit position of mask are not the
- // same.
- EVT ValTy = N->getValueType(0);
- if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
- return SDValue();
+ // Return if the shift amount and the first bit position of mask are not the
+ // same.
+ EVT ValTy = N->getValueType(0);
+ if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+ return SDValue();
- SDLoc DL(N);
- return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
- DAG.getConstant(SMPos0, DL, MVT::i32),
- DAG.getConstant(SMSize0, DL, MVT::i32),
- And0.getOperand(0));
+ SDLoc DL(N);
+ return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
+ DAG.getConstant(SMPos0, DL, MVT::i32),
+ DAG.getConstant(SMSize0, DL, MVT::i32),
+ And0.getOperand(0));
+ } else {
+ // Pattern match DINS.
+ // $dst = or (and $src, mask0), mask1
+ // where mask0 = ((1 << SMSize0) -1) << SMPos0
+ // => dins $dst, $src, pos, size
+ if (~CN->getSExtValue() == ((((int64_t)1 << SMSize0) - 1) << SMPos0) &&
+ ((SMSize0 + SMPos0 <= 64 && Subtarget.hasMips64r2()) ||
+ (SMSize0 + SMPos0 <= 32))) {
+ // Check if AND instruction has constant as argument
+ bool isConstCase = And1.getOpcode() != ISD::AND;
+ if (And1.getOpcode() == ISD::AND) {
+ if (!(CN1 = dyn_cast<ConstantSDNode>(And1->getOperand(1))))
+ return SDValue();
+ } else {
+ if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1))))
+ return SDValue();
+ }
+ SDLoc DL(N);
+ EVT ValTy = N->getOperand(0)->getValueType(0);
+ SDValue Const1;
+ SDValue SrlX;
+ if (!isConstCase) {
+ Const1 = DAG.getConstant(SMPos0, DL, MVT::i32);
+ SrlX = DAG.getNode(ISD::SRL, DL, And1->getValueType(0), And1, Const1);
+ }
+ return DAG.getNode(
+ MipsISD::Ins, DL, N->getValueType(0),
+ isConstCase
+ ? DAG.getConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy)
+ : SrlX,
+ DAG.getConstant(SMPos0, DL, MVT::i32),
+ DAG.getConstant(ValTy.getSizeInBits() / 8 < 8 ? SMSize0 & 31
+ : SMSize0,
+ DL, MVT::i32),
+ And0->getOperand(0));
+
+ }
+ return SDValue();
+ }
}
static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
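
A minimal standalone sketch (not part of the patch) of the bit-field insert the DINS combine above targets: a Size-bit field of Src is written into Dst at bit position Pos, which is the effect of the matched (or (and ...), ...) forms. Assumes 0 < Size and Pos + Size <= 64.

#include <cstdint>

uint64_t dins(uint64_t Dst, uint64_t Src, unsigned Pos, unsigned Size) {
  uint64_t FieldMask = ((uint64_t{1} << Size) - 1) << Pos; // ((1 << SMSize0) - 1) << SMPos0
  return (Dst & ~FieldMask) | ((Src << Pos) & FieldMask);  // insert Src's low Size bits at Pos
}
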
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 100503700a72..b95f1158fa56 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -75,9 +75,8 @@ namespace {
public:
static char ID;
- MipsLongBranch(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
- ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
+ MipsLongBranch()
+ : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {}
StringRef getPassName() const override { return "Mips Long Branch"; }
@@ -96,7 +95,6 @@ namespace {
MachineBasicBlock *MBBOpnd);
void expandToLongBranch(MBBInfo &Info);
- const TargetMachine &TM;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBInfos;
bool IsPIC;
@@ -469,6 +467,12 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
static_cast<const MipsSubtarget &>(F.getSubtarget());
const MipsInstrInfo *TII =
static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+
+
+ const TargetMachine& TM = F.getTarget();
+ IsPIC = TM.isPositionIndependent();
+ ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+
LongBranchSeqSize =
!IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10));
@@ -541,6 +545,4 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
/// createMipsLongBranchPass - Returns a pass that converts branches to long
/// branches.
-FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
- return new MipsLongBranch(tm);
-}
+FunctionPass *llvm::createMipsLongBranchPass() { return new MipsLongBranch(); }
diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index cf85eb3f2416..ceacaa498389 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -10,6 +10,7 @@
#include "Mips.h"
#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -22,18 +23,19 @@ namespace {
public:
static char ID;
- explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_)
- : MachineFunctionPass(ID), TM(TM_) {}
+ MipsModuleDAGToDAGISel() : MachineFunctionPass(ID) {}
// Pass Name
StringRef getPassName() const override {
return "MIPS DAG->DAG Pattern Instruction Selection";
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
- protected:
- MipsTargetMachine &TM;
+ bool runOnMachineFunction(MachineFunction &MF) override;
};
char MipsModuleDAGToDAGISel::ID = 0;
@@ -41,10 +43,12 @@ namespace {
bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<MipsTargetMachine>();
TM.resetSubtarget(&MF);
return false;
}
-llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) {
- return new MipsModuleDAGToDAGISel(TM);
+llvm::FunctionPass *llvm::createMipsModuleISelDagPass() {
+ return new MipsModuleDAGToDAGISel();
}
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index f8d9c34556bc..94a1965f9ffb 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -59,7 +59,7 @@ private:
class OptimizePICCall : public MachineFunctionPass {
public:
- OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+ OptimizePICCall() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "Mips OptimizePICCall"; }
@@ -297,6 +297,6 @@ void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
}
/// Return an OptimizeCall object.
-FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
- return new OptimizePICCall(TM);
+FunctionPass *llvm::createMipsOptimizePICCallPass() {
+ return new OptimizePICCall();
}
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 670b6c96e78e..70ead5cde6fa 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -155,6 +155,4 @@ bool MipsOs16::runOnModule(Module &M) {
return modified;
}
-ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) {
- return new MipsOs16;
-}
+ModulePass *llvm::createMipsOs16Pass() { return new MipsOs16(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 29a38fd35c1f..092de216e9b8 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -154,6 +154,11 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
bool hasNoMips16Attr =
!F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
+ bool HasMicroMipsAttr =
+ !F.getFnAttribute("micromips").hasAttribute(Attribute::None);
+ bool HasNoMicroMipsAttr =
+ !F.getFnAttribute("nomicromips").hasAttribute(Attribute::None);
+
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function, so we can enable it as a subtarget feature.
@@ -165,6 +170,10 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
FS += FS.empty() ? "+mips16" : ",+mips16";
else if (hasNoMips16Attr)
FS += FS.empty() ? "-mips16" : ",-mips16";
+ if (HasMicroMipsAttr)
+ FS += FS.empty() ? "+micromips" : ",+micromips";
+ else if (HasNoMicroMipsAttr)
+ FS += FS.empty() ? "-micromips" : ",-micromips";
if (softFloat)
FS += FS.empty() ? "+soft-float" : ",+soft-float";
@@ -223,23 +232,23 @@ TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
void MipsPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
- addPass(createAtomicExpandPass(&getMipsTargetMachine()));
+ addPass(createAtomicExpandPass());
if (getMipsSubtarget().os16())
- addPass(createMipsOs16Pass(getMipsTargetMachine()));
+ addPass(createMipsOs16Pass());
if (getMipsSubtarget().inMips16HardFloat())
- addPass(createMips16HardFloatPass(getMipsTargetMachine()));
+ addPass(createMips16HardFloatPass());
}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
bool MipsPassConfig::addInstSelector() {
- addPass(createMipsModuleISelDagPass(getMipsTargetMachine()));
+ addPass(createMipsModuleISelDagPass());
addPass(createMips16ISelDag(getMipsTargetMachine(), getOptLevel()));
addPass(createMipsSEISelDag(getMipsTargetMachine(), getOptLevel()));
return false;
}
void MipsPassConfig::addPreRegAlloc() {
- addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+ addPass(createMipsOptimizePICCallPass());
}
TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
@@ -259,15 +268,14 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
- MipsTargetMachine &TM = getMipsTargetMachine();
addPass(createMicroMipsSizeReductionPass());
  // The delay slot filler pass can potentially create forbidden slot (FS)
// hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any
// (new) pass that creates compact branches after the HSP must handle FS
// hazards itself or be pipelined before the HSP.
- addPass(createMipsDelaySlotFillerPass(TM));
+ addPass(createMipsDelaySlotFillerPass());
addPass(createMipsHazardSchedule());
- addPass(createMipsLongBranchPass(TM));
+ addPass(createMipsLongBranchPass());
addPass(createMipsConstantIslandPass());
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 144aea850833..e65b1f1aa0a5 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -689,6 +689,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+ // 128 bit shifts can be accomplished via 3 instructions for SHL and
+ // SRL, but not for SRA because of the instructions available:
+ // VS{RL} and VS{RL}O. However, due to direct move costs, it's not worth
+ // doing.
+ setOperationAction(ISD::SHL, MVT::v1i128, Expand);
+ setOperationAction(ISD::SRL, MVT::v1i128, Expand);
+ setOperationAction(ISD::SRA, MVT::v1i128, Expand);
+
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
}
else {
@@ -742,6 +750,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // 128 bit shifts can be accomplished via 3 instructions for SHL and
+ // SRL, but not for SRA because of the instructions available:
+ // VS{RL} and VS{RL}O.
+ setOperationAction(ISD::SHL, MVT::v1i128, Legal);
+ setOperationAction(ISD::SRL, MVT::v1i128, Legal);
+ setOperationAction(ISD::SRA, MVT::v1i128, Expand);
}
}
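
A minimal standalone sketch (not part of the patch) of why a 128-bit shift decomposes into the instruction pair used by the v1i128 patterns: VSLO/VSRO shifts by whole bytes (bits [6:3] of the shift amount) and VSL/VSR by the remaining 0-7 bits, with VSPLTB splatting the low byte of the amount. Uses the GCC/Clang unsigned __int128 extension as a stand-in for v1i128; valid for shift amounts below 128.

using u128 = unsigned __int128;

u128 shl_v1i128(u128 Value, unsigned Amt) {
  unsigned ByteShift = (Amt >> 3) & 0xF;          // handled by VSLO (whole bytes)
  unsigned BitShift  = Amt & 0x7;                 // handled by VSL (residual bits)
  return (Value << (ByteShift * 8)) << BitShift;  // same result as a single 128-bit shift
}
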
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index e14d18fd5433..5465b5f2d66c 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -987,12 +987,16 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSLB $vA, $vB))>;
def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -1000,12 +1004,16 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 3afcec1248d5..46f103141bc1 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1533,6 +1533,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case PPC::FCMPUD:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
+ Value = 0;
+ Mask = 0;
return true;
}
}
@@ -1591,9 +1593,12 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// We can perform this optimization, equality only, if MI is
// zero-extending.
+ // FIXME: Other possible target instructions include ANDISo and
+ // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI.
if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
MIOpC == PPC::SLW || MIOpC == PPC::SLWo ||
MIOpC == PPC::SRW || MIOpC == PPC::SRWo ||
+ MIOpC == PPC::ANDIo ||
isZeroExtendingRotate) {
noSub = true;
equalityOnly = true;
@@ -1641,6 +1646,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
break;
}
+ SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
+ SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
+
// There are two possible candidates which can be changed to set CR[01].
// One is MI, the other is a SUB instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
@@ -1652,9 +1660,37 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// same BB as the comparison. This is to allow the check below to avoid calls
// (and other explicit clobbers); instead we should really check for these
// more explicitly (in at least a few predecessors).
- else if (MI->getParent() != CmpInstr.getParent() || Value != 0) {
- // PPC does not have a record-form SUBri.
+ else if (MI->getParent() != CmpInstr.getParent())
return false;
+ else if (Value != 0) {
+ // The record-form instructions set the CR bits based on a signed comparison against 0.
+ // We try to convert a compare against 1 or -1 into a compare against 0.
+ bool Success = false;
+ if (!equalityOnly && MRI->hasOneUse(CRReg)) {
+ MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
+ if (UseMI->getOpcode() == PPC::BCC) {
+ PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+ int16_t Immed = (int16_t)Value;
+
+ if (Immed == -1 && Pred == PPC::PRED_GT) {
+ // We convert "greater than -1" into "greater than or equal to 0",
+ // since we are assuming signed comparison by !equalityOnly
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ PPC::PRED_GE));
+ Success = true;
+ }
+ else if (Immed == 1 && Pred == PPC::PRED_LT) {
+ // We convert "less than 1" into "less than or equal to 0".
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ PPC::PRED_LE));
+ Success = true;
+ }
+ }
+ }
+
+ // PPC does not have a record-form SUBri.
+ if (!Success)
+ return false;
}
// Search for Sub.
@@ -1720,15 +1756,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (NewOpC == -1)
return false;
- SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
- SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
-
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
// needs to be updated to be based on SUB. Push the condition code
// operands to OperandsToUpdate. If it is safe to remove CmpInstr, the
// condition code of these operands will be modified.
+ // Here, Value == 0 means we haven't converted comparison against 1 or -1 to
+ // comparison against 0, which may modify the predicate.
bool ShouldSwap = false;
- if (Sub) {
+ if (Sub && Value == 0) {
ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
Sub->getOperand(2).getReg() == SrcReg;
@@ -1765,6 +1800,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
} else // We need to abort on a user we don't understand.
return false;
}
+ assert(!(Value != 0 && ShouldSwap) &&
+ "Non-zero immediate support and ShouldSwap"
+ "may conflict in updating predicate");
// Create a new virtual register to hold the value of the CR set by the
// record-form instruction. If the instruction was not previously in
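
A minimal standalone sketch (not part of the patch) checking the equivalence the predicate rewrite above relies on for signed comparisons: greater-than -1 is the same as greater-or-equal 0, and less-than 1 is the same as less-or-equal 0, so the record form's implicit compare with zero can stand in for the original compare.

#include <cassert>

int main() {
  for (int X = -4; X <= 4; ++X) {
    assert((X > -1) == (X >= 0));  // PRED_GT against -1  ->  PRED_GE against 0
    assert((X < 1)  == (X <= 0));  // PRED_LT against  1  ->  PRED_LE against 0
  }
  return 0;
}
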
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 7806d45b5457..ddae5befee3e 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -322,7 +322,7 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
void PPCPassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createPPCBoolRetToIntPass());
- addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+ addPass(createAtomicExpandPass());
// For the BG/Q (or if explicitly requested), add explicit data prefetch
// intrinsics.
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 6f9cc314e376..df819ccd15db 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -96,7 +96,7 @@ namespace {
/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
/// slots in Sparc MachineFunctions
///
-FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+FunctionPass *llvm::createSparcDelaySlotFillerPass() {
return new Filler;
}
diff --git a/lib/Target/Sparc/LeonPasses.cpp b/lib/Target/Sparc/LeonPasses.cpp
index 0acc2875daa8..ca6a0dc3c2a3 100755
--- a/lib/Target/Sparc/LeonPasses.cpp
+++ b/lib/Target/Sparc/LeonPasses.cpp
@@ -21,9 +21,6 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID)
- : MachineFunctionPass(ID) {}
-
LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID)
: MachineFunctionPass(ID) {}
@@ -72,8 +69,7 @@ int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) {
//
char InsertNOPLoad::ID = 0;
-InsertNOPLoad::InsertNOPLoad(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+InsertNOPLoad::InsertNOPLoad() : LEONMachineFunctionPass(ID) {}
bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -114,7 +110,7 @@ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
//
char FixFSMULD::ID = 0;
-FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+FixFSMULD::FixFSMULD() : LEONMachineFunctionPass(ID) {}
bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -203,8 +199,7 @@ bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
//
char ReplaceFMULS::ID = 0;
-ReplaceFMULS::ReplaceFMULS(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+ReplaceFMULS::ReplaceFMULS() : LEONMachineFunctionPass(ID) {}
bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -287,8 +282,7 @@ bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
char DetectRoundChange::ID = 0;
-DetectRoundChange::DetectRoundChange(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+DetectRoundChange::DetectRoundChange() : LEONMachineFunctionPass(ID) {}
bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -338,8 +332,7 @@ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) {
//
char FixAllFDIVSQRT::ID = 0;
-FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+FixAllFDIVSQRT::FixAllFDIVSQRT() : LEONMachineFunctionPass(ID) {}
bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
diff --git a/lib/Target/Sparc/LeonPasses.h b/lib/Target/Sparc/LeonPasses.h
index 2158cb636bfc..99cdfc4589ef 100755
--- a/lib/Target/Sparc/LeonPasses.h
+++ b/lib/Target/Sparc/LeonPasses.h
@@ -32,7 +32,6 @@ protected:
std::vector<int> UsedRegisters;
protected:
- LEONMachineFunctionPass(TargetMachine &tm, char &ID);
LEONMachineFunctionPass(char &ID);
int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
@@ -48,7 +47,7 @@ class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass {
public:
static char ID;
- InsertNOPLoad(TargetMachine &tm);
+ InsertNOPLoad();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -62,7 +61,7 @@ class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass {
public:
static char ID;
- FixFSMULD(TargetMachine &tm);
+ FixFSMULD();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -74,7 +73,7 @@ class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass {
public:
static char ID;
- ReplaceFMULS(TargetMachine &tm);
+ ReplaceFMULS();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -89,7 +88,7 @@ class LLVM_LIBRARY_VISIBILITY DetectRoundChange
public:
static char ID;
- DetectRoundChange(TargetMachine &tm);
+ DetectRoundChange();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -102,7 +101,7 @@ class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass {
public:
static char ID;
- FixAllFDIVSQRT(TargetMachine &tm);
+ FixAllFDIVSQRT();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index 0a8272d89297..4135e4e1b61d 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -28,7 +28,7 @@ namespace llvm {
class MachineInstr;
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
- FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+ FunctionPass *createSparcDelaySlotFillerPass();
void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
MCInst &OutMI,
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 4ae64062d9e2..1da4d3604304 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -132,7 +132,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
}
void SparcPassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getSparcTargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
}
@@ -143,26 +143,26 @@ bool SparcPassConfig::addInstSelector() {
}
void SparcPassConfig::addPreEmitPass(){
- addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+ addPass(createSparcDelaySlotFillerPass());
if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPLoad())
{
- addPass(new InsertNOPLoad(getSparcTargetMachine()));
+ addPass(new InsertNOPLoad());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->fixFSMULD())
{
- addPass(new FixFSMULD(getSparcTargetMachine()));
+ addPass(new FixFSMULD());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->replaceFMULS())
{
- addPass(new ReplaceFMULS(getSparcTargetMachine()));
+ addPass(new ReplaceFMULS());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->detectRoundChange()) {
- addPass(new DetectRoundChange(getSparcTargetMachine()));
+ addPass(new DetectRoundChange());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->fixAllFDIVSQRT())
{
- addPass(new FixAllFDIVSQRT(getSparcTargetMachine()));
+ addPass(new FixAllFDIVSQRT());
}
}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 44c794ef5da1..b974681fb6af 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -173,7 +173,7 @@ void WebAssemblyPassConfig::addIRPasses() {
else
// Expand some atomic operations. WebAssemblyTargetLowering has hooks which
// control specifically what gets lowered.
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Fix function bitcasts, as WebAssembly requires caller and callee signatures
// to match.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 6781d761a1c4..7d146d050a5c 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -73,8 +73,8 @@ def CC_#NAME : CallingConv<[
CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Promote v8i1/v16i1/v32i1 arguments to i32.
CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
@@ -146,8 +146,8 @@ def CC_#NAME : CallingConv<[
]>;
def RetCC_#NAME : CallingConv<[
- // Promote i1, v8i1 arguments to i8.
- CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+ // Promote i1, v1i1, v8i1 arguments to i8.
+ CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>,
// Promote v16i1 arguments to i16.
CCIfType<[v16i1], CCPromoteToType<i16>>,
@@ -207,6 +207,7 @@ def RetCC_X86Common : CallingConv<[
//
// For code that doesn't care about the ABI, we allow returning more than two
// integer values in registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
@@ -375,6 +376,7 @@ def RetCC_X86_64_Swift : CallingConv<[
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
// For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
@@ -485,8 +487,8 @@ def CC_X86_64_C : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<8, 8>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
@@ -584,8 +586,8 @@ def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
// FIXME: Handle varargs.
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
@@ -796,8 +798,8 @@ def CC_X86_32_Common : CallingConv<[
]>;
def CC_X86_32_C : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
@@ -816,8 +818,8 @@ def CC_X86_32_MCU : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// If the call is not a vararg call, some arguments may be passed
// in integer registers.
@@ -828,8 +830,8 @@ def CC_X86_32_MCU : CallingConv<[
]>;
def CC_X86_32_FastCall : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
@@ -858,15 +860,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[
]>;
def CC_X86_32_ThisCall_Mingw : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
CCDelegateTo<CC_X86_32_ThisCall_Common>
]>;
def CC_X86_32_ThisCall_Win : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Pass sret arguments indirectly through stack.
CCIfSRet<CCAssignToStack<4, 4>>,
@@ -885,8 +887,8 @@ def CC_X86_32_FastCC : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index fc3b4836c178..3cfb924abd01 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3647,13 +3647,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
- if (Subtarget->hasAVX512()) {
- // Need to copy to a VK1 register.
- unsigned ResultReg = createResultReg(&X86::VK1RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg);
- return ResultReg;
- }
case MVT::i8:
return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
X86::sub_8bit);
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+  /// on SNB+, try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
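As a minimal standalone illustration (not part of this patch), the rewrite described above is sound because a 3-operand LEA computes base + index*scale + offset; with scale 1 and the destination aliasing one source, the same value is reachable with plain ADDs:

// Hypothetical values; a sketch of the address arithmetic, not LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x1000, Index = 0x20, Offset = 8;
  uint64_t Lea = Base + Index * 1 + Offset; // lea off(%base,%index,1), %dst
  uint64_t Dst = Base;                      // case where %dst aliases %base
  Dst += Index;                             // add %index, %dst
  Dst += Offset;                            // add $off, %dst
  assert(Dst == Lea);
  return 0;
}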
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
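A small self-contained sketch of the immediate-width check used above (assuming it mirrors llvm::isInt<8>): the ri8 forms are chosen only when the offset fits in a signed 8-bit immediate.

#include <cassert>
#include <cstdint>

// Hypothetical helper; stands in for llvm::isInt<8>.
static bool fitsInSignedInt8(int64_t Imm) {
  return Imm >= -128 && Imm <= 127;
}

int main() {
  assert(fitsInSignedInt8(127) && !fitsInSignedInt8(128));
  assert(fitsInSignedInt8(-128) && !fitsInSignedInt8(-129));
  return 0;
}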
/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .add(Dst)
- .add(Src1)
- .add(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .add(Dst)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI) {
+
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR =
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+  // 1. lea (%base,%index,1), %base => add %index,%base
+  // 2. lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
+ const MachineOperand &Src = DstR == BaseR ? Index : Base;
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+  // If the base is inefficient, try switching the index and base operands;
+  // otherwise just break the 3-op LEA into a 2-op LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale); add offset,%dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump(););
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+
+ else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 11c08292518a..37b248416e4a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1140,7 +1140,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1155,16 +1155,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SETCC, MVT::i1, Custom);
- setOperationAction(ISD::SETCCE, MVT::i1, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::XOR, MVT::i1, Legal);
- setOperationAction(ISD::OR, MVT::i1, Legal);
- setOperationAction(ISD::AND, MVT::i1, Legal);
- setOperationAction(ISD::SUB, MVT::i1, Custom);
- setOperationAction(ISD::ADD, MVT::i1, Custom);
- setOperationAction(ISD::MUL, MVT::i1, Custom);
for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
@@ -1233,7 +1223,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
@@ -1311,7 +1300,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
@@ -1699,7 +1690,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+ return MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
@@ -2480,6 +2471,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
if (ValVT == MVT::v64i1) {
// On a 32-bit machine, this case is handled by getv64i1Argument
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2502,7 +2496,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
-
return DAG.getBitcast(ValVT, ValReturned);
}
@@ -2809,8 +2802,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
- : Val;
+ return ExtendedInMem
+ ? (VA.getValVT().isVector()
+ ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+ : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+ : Val;
}
// FIXME: Get this from tablegen.
@@ -2960,7 +2956,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
- else if (RegVT == MVT::i1)
+ else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
@@ -6871,7 +6867,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
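A standalone worked example (hypothetical element constants, not from the patch) of why the `& 0x1` mask matters when packing i1 elements into an immediate: only bit 0 of each element constant is the boolean payload.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Elts[] = {1, 0xFF, 0, 3}; // hypothetical per-element constants
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    Immediate |= (Elts[Idx] & 0x1) << Idx; // keep only the boolean bit
  assert(Immediate == 0xB);                // bits 0, 1 and 3 set
  return 0;
}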
@@ -6914,7 +6910,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
@@ -13946,7 +13942,6 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
- assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
@@ -13980,8 +13975,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue
@@ -13992,7 +13987,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- if (Op.getSimpleValueType() == MVT::i1)
+ if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
@@ -14163,10 +14158,13 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
return EltInVec;
}
- // Insertion of one bit into first or last position
- // can be done with two SHIFTs + OR.
+ // Insertion of one bit into first position
if (IdxVal == 0 ) {
- // EltInVec already at correct index and other bits are 0.
+    // Clear the upper bits of EltInVec; only bit 0 holds the inserted value.
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
// Clean the first bit in source vector.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
@@ -14175,6 +14173,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
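A worked bit-twiddling sketch of the index-0 insertion above, using a 16-element mask and arbitrary values: the two shift pairs isolate bit 0 of the element and clear bit 0 of the vector before the OR merges them.

#include <cassert>
#include <cstdint>

int main() {
  uint16_t Vec = 0xABCD, Elt = 0x3; // arbitrary test values
  // KSHIFTL then KSHIFTR by NumElems-1: keep only bit 0 of the element.
  uint16_t EltInVec = (uint16_t)((uint16_t)(Elt << 15) >> 15);
  // KSHIFTR then KSHIFTL by 1: clear bit 0 of the source vector.
  uint16_t Cleared = (uint16_t)((uint16_t)(Vec >> 1) << 1);
  uint16_t Result = Cleared | EltInVec;
  assert(Result == ((Vec & ~1u) | (Elt & 1u)));
  return 0;
}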
+ // Insertion of one bit into last position
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
@@ -17322,8 +17321,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
- && "SetCC type must be 8-bit or 1-bit integer");
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
@@ -17457,7 +17455,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
DL, VT, Cmp, Op1, Op2);
@@ -17505,9 +17503,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
- Subtarget.hasAVX512())
- return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+ if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
@@ -19048,8 +19047,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is coming as MVT::i8 and it should be truncated
-/// to MVT::i1 while lowering masking intrinsics.
+/// The mask comes in as MVT::i8 and should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
@@ -19064,11 +19063,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
- Op.getOpcode() == X86ISD::FSETCCM_RND)
+ Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
@@ -19507,10 +19505,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -19570,18 +19569,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
}
//default rounding mode
if(!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MVT::i1),
Subtarget, DAG);
-
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
+ DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -19629,13 +19628,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
- FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
- return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
+ DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -23385,8 +23384,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
- EVT EltVT = NVT.getVectorElementType();
-
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
@@ -23404,6 +23401,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
+ EVT EltVT = InOp.getOperand(0).getValueType();
+
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
@@ -24709,16 +24708,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
// xbegin sinkMBB
//
// mainMBB:
- // eax = -1
+ // s0 = -1
+ //
+  //  fallMBB:
+ // eax = # XABORT_DEF
+ // s1 = eax
//
// sinkMBB:
- // v = eax
+  //  v = phi(s0/mainMBB, s1/fallMBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
+ MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -24726,25 +24731,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned fallDstReg = MRI.createVirtualRegister(RC);
+
// thisMBB:
- // xbegin sinkMBB
+ // xbegin fallMBB
// # fallthrough to mainMBB
- // # abortion to sinkMBB
- BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  //  # abort to fallMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
- thisMBB->addSuccessor(sinkMBB);
+ thisMBB->addSuccessor(fallMBB);
// mainMBB:
- // EAX = -1
- BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ // mainDstReg := -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
+ BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
- // sinkMBB:
- // EAX is live into the sinkMBB
- sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
- MI.getOperand(0).getReg())
+ // fallMBB:
+  //  ; pseudo instruction to model the hardware's definition of EAX by XABORT
+ // EAX := XABORT_DEF
+ // fallDstReg := EAX
+ BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
+ BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
+ fallMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
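A hedged, standalone model (not DAG/MIR code) of the value this control flow produces: -1 on the fall-through path where the transaction starts, and the abort status the hardware leaves in EAX on the fallback path, merged by the phi in sinkMBB.

#include <cassert>
#include <cstdint>

static int32_t xbeginResult(bool Aborted, uint32_t AbortStatusInEAX) {
  int32_t Result;
  if (!Aborted)
    Result = -1;                          // mainMBB: s0 = -1
  else
    Result = (int32_t)AbortStatusInEAX;   // fallMBB: s1 = EAX (XABORT_DEF)
  return Result;                          // sinkMBB: v = phi(s0, s1)
}

int main() {
  assert(xbeginResult(false, 0) == -1);
  assert(xbeginResult(true, 2) == 2);
  return 0;
}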
@@ -29574,7 +29594,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
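A quick exhaustive check (per-bit model, not SelectionDAG code) of the identity relied on here: selecting with the inverted condition and swapped operands gives the same result, and the inversion is just xor with all-ones.

#include <cassert>
#include <cstdint>

static uint8_t vselect(uint8_t C, uint8_t A, uint8_t B) {
  return (uint8_t)((C & A) | (uint8_t)(~C & B)); // bitwise select
}

int main() {
  for (int C = 0; C < 256; ++C)
    for (int A = 0; A < 256; A += 17)
      for (int B = 0; B < 256; B += 23)
        assert(vselect((uint8_t)C, (uint8_t)A, (uint8_t)B) ==
               vselect((uint8_t)(C ^ 0xFF), (uint8_t)B, (uint8_t)A));
  return 0;
}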
@@ -31321,13 +31341,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
- SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
- CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
- if (N->getValueType(0) != MVT::i1)
- return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
- FSetCC);
- return FSetCC;
+ SDValue FSetCC =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 71d395244b4a..f9344413bbcf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -31,8 +31,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
// The mask VT.
- ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
- "v" # NumElts # "i1"));
+ ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
// Suffix used in the instruction mnemonic.
string Suffix = suffix;
@@ -2263,7 +2262,7 @@ let Predicates = [HasAVX512, NoDQI] in {
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(i1 (load addr:$src)),
+ def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
@@ -2280,77 +2279,45 @@ let Predicates = [HasBWI] in {
}
let Predicates = [HasAVX512] in {
- def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
- (i32 1)), VK1)>;
+ multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> {
+ def : Pat<(maskVT (scalar_to_vector GR32:$src)),
+ (COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>;
+ def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
- def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
- (COPY_TO_REGCLASS GR32:$src, VK1)>;
+ def : Pat<(maskVT (scalar_to_vector GR8:$src)),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
- def : Pat<(i1 (trunc (i8 GR8:$src))),
- (COPY_TO_REGCLASS
- (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1)), VK1)>;
+ def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
- def : Pat<(i1 (trunc (i16 GR16:$src))),
- (COPY_TO_REGCLASS
- (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit), (i32 1)), VK1)>;
-
- def : Pat<(i32 (zext VK1:$src)),
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>;
-
- def : Pat<(i32 (anyext VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, GR32)>;
-
- def : Pat<(i8 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>;
-
- def : Pat<(i8 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
+ def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
+ }
- def : Pat<(i64 (zext VK1:$src)),
- (SUBREG_TO_REG (i64 0),
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>;
+ defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
+ defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
+ defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
+ defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
+ defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
+ defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
+ defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
- def : Pat<(i64 (anyext VK1:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK1)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK16)>;
+ def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK8)>;
- def : Pat<(i16 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>;
-
- def : Pat<(i16 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
-}
-def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK16)>;
-def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK8)>;
-def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK4)>;
-def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK2)>;
-def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK32)>;
-def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK64)>;
-
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
+}
// Mask unary operation
// - KNOT
@@ -2551,14 +2518,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+ def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
- let AddedComplexity = 10 in { // To optimize isel table.
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
- def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- }
+ def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
@@ -2570,6 +2534,12 @@ multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subV
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (COPY_TO_REGCLASS subRC:$src, RC))>;
}
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
@@ -3249,7 +3219,7 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
@@ -3260,7 +3230,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
@@ -3279,7 +3249,7 @@ def : Pat<(masked_store addr:$dst, Mask,
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
@@ -3296,7 +3266,7 @@ def : Pat<(masked_store addr:$dst, Mask,
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
@@ -3310,7 +3280,7 @@ def : Pat<(_.info128.VT (extract_subvector
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
@@ -3322,7 +3292,7 @@ def : Pat<(_.info128.VT (extract_subvector
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
}
@@ -3338,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
@@ -3350,7 +3320,7 @@ def : Pat<(_.info128.VT (extract_subvector
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
- (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
}
@@ -3381,7 +3351,7 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)),
+ (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
let hasSideEffects = 0 in
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 9867ba84bb9b..e2e228f5544b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -274,7 +274,7 @@ def X86select : SDNode<"X86ISD::SELECT",
SDTCisSameNumEltsAs<0, 1>]>>;
def X86selects : SDNode<"X86ISD::SELECTS",
- SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+ SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<2, 3>]>>;
@@ -441,7 +441,7 @@ def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
SDTCisSameNumEltsAs<0,1>,
SDTCisVT<2, i32>]>, []>;
def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
- SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>,
SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
@@ -451,7 +451,7 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
- [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ [SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 092ceb207ada..f7083a7448ce 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -10511,9 +10511,7 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
MachineFunction &MF,
- bool IsTailCall) const {
- return;
-}
+ bool IsTailCall) const {}
MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 4d7d8ece92d9..01df07e1715f 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -896,9 +896,16 @@ def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
-def OptForSize : Predicate<"Subtarget->getOptForSize()">;
-def OptForMinSize : Predicate<"Subtarget->getOptForMinSize()">;
-def OptForSpeed : Predicate<"!Subtarget->getOptForSize()">;
+
+// We could compute these on a per-module basis, but doing so requires accessing
+// the Function object through the <Target>Subtarget, and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def OptForSize : Predicate<"MF->getFunction()->optForSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">;
+}
+
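A small sketch (hypothetical key strings) of what recomputing these per function buys, together with the X86TargetMachine change further down that drops optsize/minsize from the subtarget key: functions that differ only in their size attributes now share one subtarget instead of getting separate cache entries.

#include <cassert>
#include <set>
#include <string>

int main() {
  // Hypothetical per-function attribute combinations: {optsize, minsize}.
  const bool Attrs[][2] = {{false, false}, {true, false}, {true, true}};
  std::set<std::string> OldKeys, NewKeys;
  for (const auto &A : Attrs) {
    std::string Key = "skylake+avx2";   // made-up CPU + feature string
    NewKeys.insert(Key);                // new scheme: attributes not in the key
    Key += std::string(A[0] ? "+" : "-") + "optforsize";
    Key += std::string(A[1] ? "+" : "-") + "optforminsize";
    OldKeys.insert(Key);                // old scheme: one entry per combination
  }
  assert(NewKeys.size() == 1 && OldKeys.size() == 3);
  return 0;
}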
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 38ac8be94483..61aac58a491f 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -30,6 +30,11 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
"xbegin\t$dst", []>, OpSize32;
}
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
+let isPseudo = 1, Defs = [EAX] in {
+def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
+}
+
def XEND : I<0x01, MRM_D5, (outs), (ins),
"xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 5eb5ad52840a..61956f741820 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -449,24 +449,30 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (!SrcRC)
return false;
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
- }
-
+ unsigned SubIdx;
if (DstRC == SrcRC) {
// Nothing to be done
+ SubIdx = X86::NoSubRegister;
} else if (DstRC == &X86::GR32RegClass) {
- I.getOperand(1).setSubReg(X86::sub_32bit);
+ SubIdx = X86::sub_32bit;
} else if (DstRC == &X86::GR16RegClass) {
- I.getOperand(1).setSubReg(X86::sub_16bit);
+ SubIdx = X86::sub_16bit;
} else if (DstRC == &X86::GR8RegClass) {
- I.getOperand(1).setSubReg(X86::sub_8bit);
+ SubIdx = X86::sub_8bit;
} else {
return false;
}
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ I.getOperand(1).setSubReg(SubIdx);
+
I.setDesc(TII.get(X86::COPY));
return true;
}
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 8ce240714f17..da724f5d8989 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -184,6 +184,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
return;
const LLT s64 = LLT::scalar(64);
+ const LLT v16s8 = LLT::vector(16, 8);
const LLT v8s16 = LLT::vector(8, 16);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
@@ -193,7 +194,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
setAction({BinOp, Ty}, Legal);
for (unsigned BinOp : {G_ADD, G_SUB})
- for (auto Ty : {v4s32})
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
setAction({G_MUL, v8s16}, Legal);
@@ -212,8 +213,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() {
if (!Subtarget.hasAVX2())
return;
+ const LLT v32s8 = LLT::vector(32, 8);
const LLT v16s16 = LLT::vector(16, 16);
const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({BinOp, Ty}, Legal);
for (auto Ty : {v16s16, v8s32})
setAction({G_MUL, Ty}, Legal);
@@ -224,6 +231,11 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
return;
const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({BinOp, Ty}, Legal);
setAction({G_MUL, v16s32}, Legal);
@@ -261,8 +273,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
return;
+ const LLT v64s8 = LLT::vector(64, 8);
const LLT v32s16 = LLT::vector(32, 16);
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v64s8, v32s16})
+ setAction({BinOp, Ty}, Legal);
+
setAction({G_MUL, v32s16}, Legal);
/************ VLX *******************/
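A back-of-the-envelope check (plain arithmetic, not LegalizerInfo code) that each newly legal G_ADD/G_SUB type above fills exactly one 128-, 256- or 512-bit vector register, so no legalization splitting is needed.

#include <cassert>

int main() {
  assert(16 * 8 == 128 && 8 * 16 == 128 && 4 * 32 == 128 && 2 * 64 == 128);   // SSE2
  assert(32 * 8 == 256 && 16 * 16 == 256 && 8 * 32 == 256 && 4 * 64 == 256);  // AVX2
  assert(16 * 32 == 512 && 8 * 64 == 512 && 64 * 8 == 512 && 32 * 16 == 512); // AVX-512/BW
  return 0;
}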
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index d235d2b40b15..3a61a7247c72 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -511,7 +511,7 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
-def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
@@ -519,7 +519,7 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
-def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index d66d39dcee17..2b1f43bffd71 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -320,6 +320,7 @@ void X86Subtarget::initializeEnvironment() {
CallRegIndirect = false;
LEAUsesAG = false;
SlowLEA = false;
+ Slow3OpsLEA = false;
SlowIncDec = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
@@ -336,8 +337,7 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride, bool OptForSize,
- bool OptForMinSize)
+ unsigned StackAlignOverride)
: X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
StackAlignOverride(StackAlignOverride),
@@ -347,8 +347,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering(*this, getStackAlignment()), OptForSize(OptForSize),
- OptForMinSize(OptForMinSize) {
+ FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::None);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..a9f3a2aee1be 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+  /// True if LEA instructions with three source operands (base, index, and
+  /// offset), or with base and index registers where the base is EBP, RBP,
+  /// or R13, are slow.
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -331,16 +336,12 @@ private:
X86TargetLowering TLInfo;
X86FrameLowering FrameLowering;
- bool OptForSize;
- bool OptForMinSize;
-
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride,
- bool OptForSize, bool OptForMinSize);
+ const X86TargetMachine &TM, unsigned StackAlignOverride);
/// This object will take ownership of \p GISelAccessor.
void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
@@ -490,6 +491,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
@@ -507,9 +509,6 @@ public:
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
- bool getOptForSize() const { return OptForSize; }
- bool getOptForMinSize() const { return OptForMinSize; }
-
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
/// supports it.
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 9a82e6e50463..53a8e83b36fc 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
@@ -268,12 +270,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
FS = Key.substr(CPU.size());
- bool OptForSize = F.optForSize();
- bool OptForMinSize = F.optForMinSize();
-
- Key += std::string(OptForSize ? "+" : "-") + "optforsize";
- Key += std::string(OptForMinSize ? "+" : "-") + "optforminsize";
-
auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
@@ -281,8 +277,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride,
- OptForSize, OptForMinSize);
+ Options.StackAlignmentOverride);
#ifndef LLVM_BUILD_GLOBAL_ISEL
GISelAccessor *GISel = new GISelAccessor();
#else
@@ -378,12 +373,12 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
}
void X86PassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getX86TargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
}
bool X86PassConfig::addInstSelector() {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 8566bd91c89e..fe94079fd869 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1392,15 +1392,47 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+ static const CostTblEntry AVX512CDCostTbl[] = {
+ { ISD::CTLZ, MVT::v8i64, 1 },
+ { ISD::CTLZ, MVT::v16i32, 1 },
+ { ISD::CTLZ, MVT::v32i16, 8 },
+ { ISD::CTLZ, MVT::v64i8, 20 },
+ { ISD::CTLZ, MVT::v4i64, 1 },
+ { ISD::CTLZ, MVT::v8i32, 1 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v32i8, 10 },
+ { ISD::CTLZ, MVT::v2i64, 1 },
+ { ISD::CTLZ, MVT::v4i32, 1 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 4 },
+ };
static const CostTblEntry AVX512BWCostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 5 },
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
{ ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::CTLZ, MVT::v8i64, 23 },
+ { ISD::CTLZ, MVT::v16i32, 22 },
+ { ISD::CTLZ, MVT::v32i16, 18 },
+ { ISD::CTLZ, MVT::v64i8, 17 },
+ { ISD::CTPOP, MVT::v8i64, 7 },
+ { ISD::CTPOP, MVT::v16i32, 11 },
+ { ISD::CTPOP, MVT::v32i16, 9 },
+ { ISD::CTPOP, MVT::v64i8, 6 },
+ { ISD::CTTZ, MVT::v8i64, 10 },
+ { ISD::CTTZ, MVT::v16i32, 14 },
+ { ISD::CTTZ, MVT::v32i16, 12 },
+ { ISD::CTTZ, MVT::v64i8, 9 },
};
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::CTLZ, MVT::v8i64, 29 },
+ { ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTPOP, MVT::v8i64, 16 },
+ { ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTTZ, MVT::v8i64, 20 },
+ { ISD::CTTZ, MVT::v16i32, 28 },
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -1560,6 +1592,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index e28e05c7f6a8..2950e2efbea3 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -74,7 +74,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
}
void XCorePassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
}
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 203594572618..ec06d5f9fb05 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -44,8 +44,12 @@
using namespace llvm;
static cl::opt<bool>
-RunLoopVectorization("vectorize-loops", cl::Hidden,
- cl::desc("Run the Loop vectorization passes"));
+ RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Run Partial inlining pass"));
+
+static cl::opt<bool>
+ RunLoopVectorization("vectorize-loops", cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
static cl::opt<bool>
RunSLPVectorization("vectorize-slp", cl::Hidden,
@@ -516,6 +520,8 @@ void PassManagerBuilder::populateModulePassManager(
// pass manager that we are specifically trying to avoid. To prevent this
// we must insert a no-op module pass to reset the pass manager.
MPM.add(createBarrierNoopPass());
+ if (RunPartialInlining)
+ MPM.add(createPartialInliningPass());
if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO &&
!PrepareForThinLTO)
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0ca62b7ae40c..733eeb1767a3 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -852,8 +852,9 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
/// This basically requires proving that the add in the original type would not
/// overflow to change the sign bit or have a carry out.
/// TODO: Handle this for Vectors.
-bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowSignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// If LHS and RHS each have at least two sign bits, the subtraction
// cannot overflow.
if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
@@ -869,8 +870,8 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
// Subtraction of two 2's complement numbers having identical signs will
// never overflow.
- if ((LHSKnown.One[BitWidth - 1] && RHSKnown.One[BitWidth - 1]) ||
- (LHSKnown.Zero[BitWidth - 1] && RHSKnown.Zero[BitWidth - 1]))
+ if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
return true;
// TODO: implement logic similar to checkRippleForAdd
@@ -879,8 +880,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nuw LHS, RHS)
-bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
@@ -1180,7 +1182,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+ willNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1197,7 +1199,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+ willNotOverflowSignedAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
@@ -1275,10 +1277,10 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
- // TODO(jingyue): Consider WillNotOverflowSignedAdd and
+ // TODO(jingyue): Consider willNotOverflowSignedAdd and
// willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
@@ -1351,7 +1353,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- WillNotOverflowSignedAdd(LHSIntVal, CI, I)) {
+ willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
CI, "addconv");
@@ -1370,7 +1372,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// and if the integer add will not overflow.
if (LHSIntVal->getType() == RHSIntVal->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- WillNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
+ willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
RHSIntVal, "addconv");
@@ -1676,11 +1678,11 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return replaceInstUsesWith(I, Res);
bool Changed = false;
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, I)) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
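
The rewritten willNotOverflowSignedSub relies on two facts: if both operands have at least two sign bits the subtraction cannot overflow, and operands with identical sign bits never overflow on subtraction. A standalone C++ sketch (not part of the patch) that checks both claims exhaustively for the i8 case:

// Exhaustive i8 check of the two no-overflow arguments used by
// willNotOverflowSignedSub. Not part of the patch.
#include <cassert>
#include <cstdint>

static bool fitsInI8(int Wide) { return Wide >= INT8_MIN && Wide <= INT8_MAX; }

int main() {
  for (int L = INT8_MIN; L <= INT8_MAX; ++L) {
    for (int R = INT8_MIN; R <= INT8_MAX; ++R) {
      int Diff = L - R; // exact difference, computed in a wider type
      // Claim 1: two or more sign bits on each side, i.e. both in [-64, 63].
      bool TwoSignBits = L >= -64 && L <= 63 && R >= -64 && R <= 63;
      // Claim 2: identical sign bits.
      bool SameSign = (L < 0) == (R < 0);
      if (TwoSignBits || SameSign)
        assert(fitsInI8(Diff) && "claimed no-overflow case overflowed");
    }
  }
  return 0;
}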
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 82dc88f1b3ad..4227b2d01be8 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -764,7 +764,7 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
}
/// Fold (icmp)&(icmp) if possible.
-Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
// (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
@@ -943,7 +943,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
-Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+Value *InstCombiner::foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
@@ -1126,8 +1126,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
if (ICmp0 && ICmp1) {
- Value *Res = LogicOpc == Instruction::And ? FoldAndOfICmps(ICmp0, ICmp1)
- : FoldOrOfICmps(ICmp0, ICmp1, &I);
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1)
+ : foldOrOfICmps(ICmp0, ICmp1, &I);
if (Res)
return CastInst::Create(CastOpcode, Res, DestTy);
return nullptr;
@@ -1138,8 +1138,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
if (FCmp0 && FCmp1) {
- Value *Res = LogicOpc == Instruction::And ? FoldAndOfFCmps(FCmp0, FCmp1)
- : FoldOrOfFCmps(FCmp0, FCmp1);
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfFCmps(FCmp0, FCmp1)
+ : foldOrOfFCmps(FCmp0, FCmp1);
if (Res)
return CastInst::Create(CastOpcode, Res, DestTy);
return nullptr;
@@ -1425,7 +1425,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = FoldAndOfICmps(LHS, RHS))
+ if (Value *Res = foldAndOfICmps(LHS, RHS))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -1433,18 +1433,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldAndOfICmps(LHS, Cmp))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldAndOfICmps(LHS, Cmp))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldAndOfICmps(Cmp, RHS))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldAndOfICmps(Cmp, RHS))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS))
return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
}
}
@@ -1452,7 +1452,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
// If and'ing two fcmp, try combine them into one.
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = FoldAndOfFCmps(LHS, RHS))
+ if (Value *Res = foldAndOfFCmps(LHS, RHS))
return replaceInstUsesWith(I, Res);
if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
@@ -1589,7 +1589,7 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
}
/// Fold (icmp)|(icmp) if possible.
-Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Instruction *CxtI) {
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
@@ -1846,7 +1846,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
-Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
@@ -2197,7 +2197,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
+ if (Value *Res = foldOrOfICmps(LHS, RHS, &I))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -2205,18 +2205,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, &I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, &I))
return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
}
}
@@ -2224,7 +2224,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y)
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = FoldOrOfFCmps(LHS, RHS))
+ if (Value *Res = foldOrOfFCmps(LHS, RHS))
return replaceInstUsesWith(I, Res);
if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
@@ -2320,6 +2320,24 @@ static Instruction *foldXorToXor(BinaryOperator &I) {
return nullptr;
}
+Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+ if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+ bool isSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+ }
+ }
+
+ return nullptr;
+}
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
@@ -2579,23 +2597,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
match(Op1, m_Not(m_Specific(A))))
return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
- // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
- if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
- if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
- if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
- bool isSigned = LHS->isSigned() || RHS->isSigned();
- return replaceInstUsesWith(I,
- getNewICmpValue(isSigned, Code, Op0, Op1,
- Builder));
- }
- }
+ if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Value *V = foldXorOfICmps(LHS, RHS))
+ return replaceInstUsesWith(I, V);
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;
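
The new foldXorOfICmps helper folds (icmp1 A, B) ^ (icmp2 A, B) into a single compare by XOR-ing the predicate codes; for example (A < B) ^ (A == B) becomes (A <= B). A standalone C++ sketch (not part of the patch; the bit layout is illustrative, not LLVM's getICmpCode encoding) of why XOR-ing the codes is sound:

// Each predicate is a 3-bit mask over the mutually exclusive outcomes
// {less, equal, greater}; XOR of two predicate results equals the result
// of the XOR-ed mask. Illustrative only.
#include <cassert>

enum : unsigned { LT = 4, EQ = 2, GT = 1 };

static bool evalCode(unsigned Code, int A, int B) {
  unsigned Outcome = A < B ? LT : (A == B ? EQ : GT);
  return (Code & Outcome) != 0;
}

int main() {
  for (unsigned C1 = 0; C1 < 8; ++C1)
    for (unsigned C2 = 0; C2 < 8; ++C2)
      for (int A = -2; A <= 2; ++A)
        for (int B = -2; B <= 2; ++B)
          assert((evalCode(C1, A, B) ^ evalCode(C2, A, B)) ==
                 evalCode(C1 ^ C2, A, B));
  return 0;
}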
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 001a4bcf16f3..f4bf5221f6a2 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -585,6 +585,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return CastInst::CreateIntegerCast(Shift, DestTy, false);
}
+ // FIXME: We should canonicalize to zext/trunc and remove this transform.
// Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type
// conversion.
// It works because bits coming from sign extension have the same value as
@@ -595,18 +596,24 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
Value *SExt = cast<Instruction>(Src)->getOperand(0);
const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ const unsigned CISize = CI.getType()->getPrimitiveSizeInBits();
+ const unsigned MaxAmt = SExtSize - std::max(CISize, ASize);
unsigned ShiftAmt = Cst->getZExtValue();
+
// This optimization can be only performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
// can only shift by values no larger than the number of extension bits.
// FIXME: Instead of bailing when the shift is too large, use and to clear
// the extra bits.
- if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
- // If shifting by the size of the original value in bits or more, it is
- // being filled with the sign bit, so shift by ASize-1 to avoid ub.
- Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
- Shift->takeName(Src);
- return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ if (ShiftAmt <= MaxAmt) {
+ if (CISize == ASize)
+ return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(),
+ std::min(ShiftAmt, ASize - 1)));
+ if (SExt->hasOneUse()) {
+ Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
+ Shift->takeName(Src);
+ return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ }
}
}
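
The new bound MaxAmt = SExtSize - max(CISize, ASize) is what keeps the lshr from pulling zero bits into the truncated result. A standalone C++ check (not part of the patch; it assumes the usual two's complement narrowing and arithmetic right shift) for A:i8 sign-extended to i32, shifted and truncated to i16, where MaxAmt works out to 16:

// trunc(lshr(sext i8 A to i32, C), i16) equals sext(ashr(A, min(C, 7))) for
// every C <= 16; C = 17 breaks it. Illustrative only.
#include <cassert>
#include <cstdint>

static int16_t viaLshrTrunc(int8_t A, unsigned C) {
  uint32_t Wide = static_cast<uint32_t>(static_cast<int32_t>(A)); // sext
  return static_cast<int16_t>(Wide >> C);                         // lshr + trunc
}

static int16_t viaAshr(int8_t A, unsigned C) {
  unsigned Amt = C < 7 ? C : 7;          // shift by at most ASize - 1
  return static_cast<int16_t>(A >> Amt); // ashr on the narrow value, then sext
}

int main() {
  for (int V = INT8_MIN; V <= INT8_MAX; ++V) {
    int8_t A = static_cast<int8_t>(V);
    for (unsigned C = 0; C <= 16; ++C) // C <= MaxAmt: the rewrite is exact
      assert(viaLshrTrunc(A, C) == viaAshr(A, C));
  }
  assert(viaLshrTrunc(-128, 17) != viaAshr(-128, 17)); // beyond MaxAmt it is not
  return 0;
}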
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 60ed4057cedd..6492eaedae9c 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -3506,7 +3506,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
// We can strength reduce this signed add into a regular add if we can prove
// that it will never overflow.
if (OCF == OCF_SIGNED_ADD)
- if (WillNotOverflowSignedAdd(LHS, RHS, OrigI))
+ if (willNotOverflowSignedAdd(LHS, RHS, OrigI))
return SetResult(Builder->CreateNSWAdd(LHS, RHS), Builder->getFalse(),
true);
break;
@@ -3519,11 +3519,11 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
return SetResult(LHS, Builder->getFalse(), false);
if (OCF == OCF_SIGNED_SUB) {
- if (WillNotOverflowSignedSub(LHS, RHS, OrigI))
+ if (willNotOverflowSignedSub(LHS, RHS, OrigI))
return SetResult(Builder->CreateNSWSub(LHS, RHS), Builder->getFalse(),
true);
} else {
- if (WillNotOverflowUnsignedSub(LHS, RHS, OrigI))
+ if (willNotOverflowUnsignedSub(LHS, RHS, OrigI))
return SetResult(Builder->CreateNUWSub(LHS, RHS), Builder->getFalse(),
true);
}
@@ -3553,7 +3553,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
return SetResult(LHS, Builder->getFalse(), false);
if (OCF == OCF_SIGNED_MUL)
- if (WillNotOverflowSignedMul(LHS, RHS, OrigI))
+ if (willNotOverflowSignedMul(LHS, RHS, OrigI))
return SetResult(Builder->CreateNSWMul(LHS, RHS), Builder->getFalse(),
true);
break;
@@ -4260,6 +4260,80 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
}
+/// Integer compare with boolean values can always be turned into bitwise ops.
+static Instruction *canonicalizeICmpBool(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *A = I.getOperand(0), *B = I.getOperand(1);
+ assert(A->getType()->getScalarType()->isIntegerTy(1) && "Bools only");
+
+ // A boolean compared to true/false can be simplified to Op0/true/false in
+ // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
+ // Cases not handled by InstSimplify are always 'not' of Op0.
+ if (match(B, m_Zero())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_EQ: // A == 0 -> !A
+ case CmpInst::ICMP_ULE: // A <=u 0 -> !A
+ case CmpInst::ICMP_SGE: // A >=s 0 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ } else if (match(B, m_One())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_NE: // A != 1 -> !A
+ case CmpInst::ICMP_ULT: // A <u 1 -> !A
+ case CmpInst::ICMP_SGT: // A >s -1 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ }
+
+ switch (I.getPredicate()) {
+ default:
+ llvm_unreachable("Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ:
+ // icmp eq i1 A, B -> ~(A ^ B)
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ case ICmpInst::ICMP_NE:
+ // icmp ne i1 A, B -> A ^ B
+ return BinaryOperator::CreateXor(A, B);
+
+ case ICmpInst::ICMP_UGT:
+ // icmp ugt -> icmp ult
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULT:
+ // icmp ult i1 A, B -> ~A & B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGT:
+ // icmp sgt -> icmp slt
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLT:
+ // icmp slt i1 A, B -> A & ~B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
+
+ case ICmpInst::ICMP_UGE:
+ // icmp uge -> icmp ule
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULE:
+ // icmp ule i1 A, B -> ~A | B
+ return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGE:
+ // icmp sge -> icmp sle
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLE:
+ // icmp sle i1 A, B -> A | ~B
+ return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
+ }
+}
+
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -4297,49 +4371,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
}
- Type *Ty = Op0->getType();
-
- // icmp's with boolean values can always be turned into bitwise operations
- if (Ty->getScalarType()->isIntegerTy(1)) {
- switch (I.getPredicate()) {
- default: llvm_unreachable("Invalid icmp instruction!");
- case ICmpInst::ICMP_EQ: { // icmp eq i1 A, B -> ~(A^B)
- Value *Xor = Builder->CreateXor(Op0, Op1, I.getName() + "tmp");
- return BinaryOperator::CreateNot(Xor);
- }
- case ICmpInst::ICMP_NE: // icmp ne i1 A, B -> A^B
- return BinaryOperator::CreateXor(Op0, Op1);
-
- case ICmpInst::ICMP_UGT:
- std::swap(Op0, Op1); // Change icmp ugt -> icmp ult
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULT:{ // icmp ult i1 A, B -> ~A & B
- Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
- return BinaryOperator::CreateAnd(Not, Op1);
- }
- case ICmpInst::ICMP_SGT:
- std::swap(Op0, Op1); // Change icmp sgt -> icmp slt
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLT: { // icmp slt i1 A, B -> A & ~B
- Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
- return BinaryOperator::CreateAnd(Not, Op0);
- }
- case ICmpInst::ICMP_UGE:
- std::swap(Op0, Op1); // Change icmp uge -> icmp ule
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
- Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
- return BinaryOperator::CreateOr(Not, Op1);
- }
- case ICmpInst::ICMP_SGE:
- std::swap(Op0, Op1); // Change icmp sge -> icmp sle
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
- Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- }
- }
+ if (Op0->getType()->getScalarType()->isIntegerTy(1))
+ if (Instruction *Res = canonicalizeICmpBool(I, *Builder))
+ return Res;
if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
return NewICmp;
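
canonicalizeICmpBool maps every two-variable i1 compare onto and/or/xor/not exactly as its comments describe. A standalone C++ truth-table check (not part of the patch), where bool true stands for the i1 value 1, which reads as -1 under the signed predicates:

// Truth-table check of the i1 compare rewrites. Illustrative only.
#include <cassert>

int main() {
  for (int AI = 0; AI <= 1; ++AI) {
    for (int BI = 0; BI <= 1; ++BI) {
      bool A = AI, B = BI;
      int SA = A ? -1 : 0, SB = B ? -1 : 0; // signed view of an i1
      unsigned UA = AI, UB = BI;            // unsigned view of an i1
      assert((UA == UB) == !(A ^ B));  // eq  -> ~(A ^ B)
      assert((UA != UB) == (A ^ B));   // ne  ->  A ^ B
      assert((UA < UB) == (!A && B));  // ult -> ~A & B
      assert((UA > UB) == (A && !B));  // ugt ->  A & ~B
      assert((SA < SB) == (A && !B));  // slt ->  A & ~B
      assert((SA > SB) == (!A && B));  // sgt -> ~A & B
      assert((UA <= UB) == (!A || B)); // ule -> ~A | B
      assert((UA >= UB) == (A || !B)); // uge ->  A | ~B
      assert((SA <= SB) == (A || !B)); // sle ->  A | ~B
      assert((SA >= SB) == (!A || B)); // sge -> ~A | B
    }
  }
  return 0;
}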
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index f88a2c6acc3f..6829be86885b 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -275,11 +275,7 @@ public:
Instruction *visitSDiv(BinaryOperator &I);
Instruction *visitFDiv(BinaryOperator &I);
Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
- Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
- Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
Instruction *visitAnd(BinaryOperator &I);
- Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
- Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A,
Value *B, Value *C);
Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
@@ -410,18 +406,24 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
OverflowResult::NeverOverflows;
};
- bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
OverflowResult::NeverOverflows;
};
- bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
- bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
- bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
- bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+ bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
OverflowResult::NeverOverflows;
};
@@ -445,6 +447,12 @@ private:
Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2);
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+ Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
+ Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+
public:
/// \brief Inserts an instruction \p New before instruction \p Old
///
@@ -523,29 +531,31 @@ public:
return nullptr; // Don't do anything with FI
}
- void computeKnownBits(Value *V, KnownBits &Known,
- unsigned Depth, Instruction *CxtI) const {
+ void computeKnownBits(const Value *V, KnownBits &Known,
+ unsigned Depth, const Instruction *CxtI) const {
llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
}
- KnownBits computeKnownBits(Value *V, unsigned Depth,
- Instruction *CxtI) const {
+ KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Instruction *CxtI) const {
return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
}
- bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
- Instruction *CxtI = nullptr) const {
+ bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
}
- unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
- Instruction *CxtI = nullptr) const {
+ unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
- const Instruction *CxtI) {
+ OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
}
- OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
- const Instruction *CxtI) {
+ OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
OverflowResult computeOverflowForSignedAdd(const Value *LHS,
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 2a35259f2103..fc13854f8fe7 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -132,8 +132,9 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) {
/// \brief Return true if we can prove that:
/// (mul LHS, RHS) === (mul nsw LHS, RHS)
-bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
@@ -384,7 +385,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getSExt(CI, I.getType()) == Op1C &&
- WillNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
+ willNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
Builder->CreateNSWMul(Op0Conv->getOperand(0), CI, "mulconv");
@@ -401,7 +402,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- WillNotOverflowSignedMul(Op0Conv->getOperand(0),
+ willNotOverflowSignedMul(Op0Conv->getOperand(0),
Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
Value *NewMul = Builder->CreateNSWMul(
@@ -447,7 +448,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index d8f8a58a5fdf..c4f450949e6d 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -606,7 +606,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (unsigned Count = replaceDominatedUsesWith(
CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
Changed = true;
- NumCSECVP = NumCSECVP + Count;
+ NumCSECVP += Count;
}
}
}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index c04646eed49a..0490d93f6455 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -2057,7 +2057,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
DEBUG(verifyRemoved(PREInstr));
- delete PREInstr;
+ PREInstr->deleteValue();
return false;
}
}
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ae353ea44595..ada22ae38eb8 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -833,15 +833,13 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
CondBr->eraseFromParent();
if (CondCmp->use_empty())
CondCmp->eraseFromParent();
- else if (CondCmp->getParent() == BB) {
- // If the fact we just learned is true for all uses of the
- // condition, replace it with a constant value
- auto *CI = Ret == LazyValueInfo::True ?
- ConstantInt::getTrue(CondCmp->getType()) :
- ConstantInt::getFalse(CondCmp->getType());
- CondCmp->replaceAllUsesWith(CI);
- CondCmp->eraseFromParent();
- }
+ // TODO: We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that use `Cond`, because we rely on
+ // those guards/assumes to reason about the value of `Cond` at the end
+ // of the block, while RAUW unconditionally replaces all uses, including
+ // the guards/assumes themselves and the uses before the guard/assume.
return true;
}
@@ -1327,14 +1325,13 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
CondInst->eraseFromParent();
- else if (OnlyVal && OnlyVal != MultipleVal &&
- CondInst->getParent() == BB) {
- // If we just learned Cond is the same value for all uses of the
- // condition, replace it with a constant value
- CondInst->replaceAllUsesWith(OnlyVal);
- if (!CondInst->mayHaveSideEffects())
- CondInst->eraseFromParent();
- }
+ // TODO: We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that use `Cond`, because we rely on
+ // those guards/assumes to reason about the value of `Cond` at the end
+ // of the block, while RAUW unconditionally replaces all uses, including
+ // the guards/assumes themselves and the uses before the guard/assume.
}
return true;
}
@@ -1909,7 +1906,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
{BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
ValueMapping[&*BI] = IV;
if (!New->mayHaveSideEffects()) {
- delete New;
+ New->deleteValue();
New = nullptr;
}
} else {
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 02215d3450c2..494cbc61bc9c 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -228,7 +228,7 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
L.Load->replaceAllUsesWith(V);
}
- NumLoadsCombined = NumLoadsCombined + Loads.size();
+ NumLoadsCombined += Loads.size();
return true;
}
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index cb6223b070a6..97337ea5ba62 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -110,6 +110,15 @@ private:
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
/// \name Countable Loop Idiom Handling
/// @{
@@ -119,8 +128,7 @@ private:
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
- bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern,
- bool &ForMemcpy);
+ LegalStoreKind isLegalStore(StoreInst *SI);
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
bool ForMemset);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
@@ -343,20 +351,20 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
-bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
- bool &ForMemsetPattern, bool &ForMemcpy) {
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// Don't touch volatile stores.
if (!SI->isSimple())
- return false;
+ return LegalStoreKind::None;
// Don't convert stores of non-integral pointer types to memsets (which stores
// integers).
if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
- return false;
+ return LegalStoreKind::None;
// Avoid merging nontemporal stores.
if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
+ return LegalStoreKind::None;
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
@@ -364,7 +372,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
// Reject stores that are so large that they overflow an unsigned.
uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
- return false;
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided store. If we have something else, it's a
@@ -372,11 +380,11 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *StoreEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// Check to see if we have a constant stride.
if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
- return false;
+ return LegalStoreKind::None;
// See if the store can be turned into a memset.
@@ -394,15 +402,13 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
// promote the memset.
CurLoop->isLoopInvariant(SplatValue)) {
// It looks like we can use SplatValue.
- ForMemset = true;
- return true;
+ return LegalStoreKind::Memset;
} else if (HasMemsetPattern &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
// It looks like we can use PatternValue!
- ForMemsetPattern = true;
- return true;
+ return LegalStoreKind::MemsetPattern;
}
// Otherwise, see if the store can be turned into a memcpy.
@@ -412,12 +418,12 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = getStoreSizeInBytes(SI, DL);
if (StoreSize != Stride && StoreSize != -Stride)
- return false;
+ return LegalStoreKind::None;
// The store must be feeding a non-volatile load.
LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
if (!LI || !LI->isSimple())
- return false;
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
@@ -425,18 +431,17 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *LoadEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// The store and load must share the same stride.
if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return false;
+ return LegalStoreKind::None;
// Success. This store can be converted into a memcpy.
- ForMemcpy = true;
- return true;
+ return LegalStoreKind::Memcpy;
}
// This store can't be transformed into a memset/memcpy.
- return false;
+ return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
@@ -448,24 +453,28 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
if (!SI)
continue;
- bool ForMemset = false;
- bool ForMemsetPattern = false;
- bool ForMemcpy = false;
// Make sure this is a strided store with a constant stride.
- if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy))
- continue;
-
- // Save the store locations.
- if (ForMemset) {
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemset[Ptr].push_back(SI);
- } else if (ForMemsetPattern) {
+ } break;
+ case LegalStoreKind::MemsetPattern: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemsetPattern[Ptr].push_back(SI);
- } else if (ForMemcpy)
+ } break;
+ case LegalStoreKind::Memcpy:
StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
}
}
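
The new LegalStoreKind enum names the three store shapes the pass already recognized through the old ForMemset/ForMemsetPattern/ForMemcpy out-parameters. Illustrative source loops (not taken from the patch) that correspond to each kind once the stores are strided with a constant stride and the trip count is computable:

// Illustrative only; the function and variable names are made up.
#include <cstddef>

void legalStoreKinds(unsigned char *Dst, const unsigned char *Src,
                     unsigned short *Pat, size_t N) {
  for (size_t I = 0; I < N; ++I)
    Dst[I] = 0;        // splat of one byte          -> LegalStoreKind::Memset
  for (size_t I = 0; I < N; ++I)
    Pat[I] = 0xABCD;   // repeating constant pattern -> MemsetPattern
                       // (memset_pattern16 where available)
  for (size_t I = 0; I < N; ++I)
    Dst[I] = Src[I];   // load feeding the store     -> Memcpy
}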
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 0ce604429326..32fd3da465fe 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -58,12 +58,30 @@ using namespace llvm;
namespace {
class LoopPredication {
+ /// Represents an induction variable check:
+ /// icmp Pred, <induction variable>, <loop invariant limit>
+ struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ };
+
ScalarEvolution *SE;
Loop *L;
const DataLayout *DL;
BasicBlock *Preheader;
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+
+ Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
+ Instruction *InsertAt);
+
Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
IRBuilder<> &Builder);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
@@ -116,16 +134,10 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
return getLoopPassPreservedAnalyses();
}
-/// If ICI can be widened to a loop invariant condition emits the loop
-/// invariant condition in the loop preheader and return it, otherwise
-/// returns None.
-Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
- SCEVExpander &Expander,
- IRBuilder<> &Builder) {
- DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- DEBUG(ICI->dump());
-
+Optional<LoopPredication::LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
ICmpInst::Predicate Pred = ICI->getPredicate();
+
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
const SCEV *LHSS = SE->getSCEV(LHS);
@@ -135,17 +147,54 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
if (isa<SCEVCouldNotCompute>(RHSS))
return None;
- // Canonicalize RHS to be loop invariant bound, LHS - a loop computable index
+ // Canonicalize so that RHS is the loop-invariant bound and LHS is a loop-computable IV.
if (SE->isLoopInvariant(LHSS, L)) {
std::swap(LHS, RHS);
std::swap(LHSS, RHSS);
Pred = ICmpInst::getSwappedPredicate(Pred);
}
- if (!SE->isLoopInvariant(RHSS, L) || !isSafeToExpand(RHSS, *SE))
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
return None;
- const SCEVAddRecExpr *IndexAR = dyn_cast<SCEVAddRecExpr>(LHSS);
- if (!IndexAR || IndexAR->getLoop() != L)
+ return LoopICmp(Pred, AR, RHSS);
+}
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS, Instruction *InsertAt) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+/// If ICI can be widened to a loop-invariant condition, emit that condition
+/// in the loop preheader and return it; otherwise return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = RangeCheck->Pred;
+ const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
+ const SCEV *RHSS = RangeCheck->Limit;
+
+ auto CanExpand = [this](const SCEV *S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+ };
+ if (!CanExpand(RHSS))
return None;
DEBUG(dbgs() << "IndexAR: ");
@@ -170,17 +219,13 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
DEBUG(dbgs() << "NewLHSS: ");
DEBUG(NewLHSS->dump());
- if (!SE->isLoopInvariant(NewLHSS, L) || !isSafeToExpand(NewLHSS, *SE))
+ if (!CanExpand(NewLHSS))
return None;
DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
- Type *Ty = LHS->getType();
Instruction *InsertAt = Preheader->getTerminator();
- assert(Ty == RHS->getType() && "icmp operands have different types?");
- Value *NewLHS = Expander.expandCodeFor(NewLHSS, Ty, InsertAt);
- Value *NewRHS = Expander.expandCodeFor(RHSS, Ty, InsertAt);
- return Builder.CreateICmp(Pred, NewLHS, NewRHS);
+ return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
}
bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
@@ -272,6 +317,9 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
if (II->getIntrinsicID() == Intrinsic::experimental_guard)
Guards.push_back(II);
+ if (Guards.empty())
+ return false;
+
SCEVExpander Expander(*SE, *DL, "loop-predication");
bool Changed = false;
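
parseLoopICmp splits a guard condition into {Pred, IV, Limit}, and expandCheck emits a loop-invariant compare in the preheader. An illustrative C++ sketch of what widening a range check means (not the pass's actual output; the real pass rewrites the guard's condition through SCEV rather than editing source):

// Before: the guard depends on the induction variable every iteration.
// After: one loop-invariant check, computed once before the loop, implies
// the guard for every executed iteration. Illustrative only.
#include <cassert>
#include <cstddef>

void before(int *A, size_t N, size_t Len) {
  for (size_t I = 0; I < N; ++I) {
    assert(I < Len); // parsed as {ult, {0,+,1}, Len}
    A[I] += 1;
  }
}

void after(int *A, size_t N, size_t Len) {
  bool WideCond = N == 0 || N - 1 < Len; // emitted in the preheader
  for (size_t I = 0; I < N; ++I) {
    assert(WideCond); // the guard now tests a loop-invariant condition
    A[I] += 1;
  }
}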
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 2ba9265566a8..7312d97f8efe 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -347,7 +347,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// in the map.
ValueMap[Inst] = V;
if (!C->mayHaveSideEffects()) {
- delete C;
+ C->deleteValue();
C = nullptr;
}
} else {
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index bd1f21c69eba..28d94497a3ef 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3805,6 +3805,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
continue;
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 0e7572f8d2e5..5cfbf6baeaa9 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -30,9 +30,19 @@
/// tracks what operations have a given value number (IE it also tracks the
/// reverse mapping from value number -> operations with that value number), so
/// that it only needs to reprocess the instructions that are affected when
-/// something's value number changes. The rest of the algorithm is devoted to
-/// performing symbolic evaluation, forward propagation, and simplification of
-/// operations based on the value numbers deduced so far.
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
///
/// We also do not perform elimination by using any published algorithm. All
/// published algorithms are O(Instructions). Instead, we use a technique that
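
The phi-of-ops technique described above treats an op of phis as congruent to a phi whose incoming values are the op applied in each predecessor. A minimal C++ analogue of that equivalence (illustrative only; it is neither LLVM IR nor the pass's data structures):

// Selecting the operands first and then adding, or adding in each arm and
// then selecting, computes the same value. Illustrative only.
#include <cassert>

static int opOfPhis(bool Cond, int A, int B, int C, int D) {
  int P = Cond ? A : C; // phi(A, C)
  int Q = Cond ? B : D; // phi(B, D)
  return P + Q;         // op of phis
}

static int phiOfOps(bool Cond, int A, int B, int C, int D) {
  return Cond ? (A + B) : (C + D); // phi(A + B, C + D)
}

int main() {
  for (int Cond = 0; Cond <= 1; ++Cond)
    assert(opOfPhis(Cond != 0, 1, 2, 3, 4) == phiOfOps(Cond != 0, 1, 2, 3, 4));
  return 0;
}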
@@ -51,7 +61,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -104,12 +113,14 @@ STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
-STATISTIC(NumGVNNotMostDominatingLeader,
- "Number of times a member dominated it's new classes' leader");
STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
DEBUG_COUNTER(VNCounter, "newgvn-vn",
"Controls which instructions are value numbered")
-
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for")
// Currently store defining access refinement is too slow due to basicaa being
// egregiously slow. This flag lets us keep it working while we work on this
// issue.
@@ -172,10 +183,9 @@ private:
}
}
// See if we really were the root of a component, by seeing if we still have
- // our DFSNumber.
- // If we do, we are the root of the component, and we have completed a
- // component. If we do not,
- // we are not the root of a component, and belong on the component stack.
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
if (Root.lookup(I) == OurDFS) {
unsigned ComponentID = Components.size();
Components.resize(Components.size() + 1);
@@ -367,6 +377,7 @@ private:
int StoreCount = 0;
};
+struct HashedExpression;
namespace llvm {
template <> struct DenseMapInfo<const Expression *> {
static const Expression *getEmptyKey() {
@@ -379,9 +390,11 @@ template <> struct DenseMapInfo<const Expression *> {
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
return reinterpret_cast<const Expression *>(Val);
}
- static unsigned getHashValue(const Expression *V) {
- return static_cast<unsigned>(V->getHashValue());
+ static unsigned getHashValue(const Expression *E) {
+ return static_cast<unsigned>(E->getHashValue());
}
+ static unsigned getHashValue(const HashedExpression &HE);
+ static bool isEqual(const HashedExpression &LHS, const Expression *RHS);
static bool isEqual(const Expression *LHS, const Expression *RHS) {
if (LHS == RHS)
return true;
@@ -393,6 +406,26 @@ template <> struct DenseMapInfo<const Expression *> {
};
} // end namespace llvm
+// This is just a wrapper around Expression that computes the hash value once at
+// creation time. Hash values for an Expression can't change once they are
+// inserted into the DenseMap (it breaks DenseMap), so they must be immutable at
+// that point anyway.
+struct HashedExpression {
+ const Expression *E;
+ unsigned HashVal;
+ HashedExpression(const Expression *E)
+ : E(E), HashVal(DenseMapInfo<const Expression *>::getHashValue(E)) {}
+};
+
+unsigned
+DenseMapInfo<const Expression *>::getHashValue(const HashedExpression &HE) {
+ return HE.HashVal;
+}
+bool DenseMapInfo<const Expression *>::isEqual(const HashedExpression &LHS,
+ const Expression *RHS) {
+ return isEqual(LHS.E, RHS);
+}
+
namespace {
class NewGVN {
Function &F;
@@ -430,6 +463,33 @@ class NewGVN {
// Value Mappings.
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ DenseSet<const Instruction *> PHINodeUses;
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+ // Map between the temporary phis we created and the real instructions they
+ // are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+ // Map from basic block to the temporary operations we created
+ DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs;
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+ // Set of all temporary instructions we created.
+ DenseSet<Instruction *> AllTempInstructions;
// Mapping from predicate info we used to the instructions we used it with.
// In order to correctly ensure propagation, we must keep track of what
@@ -462,12 +522,19 @@ class NewGVN {
enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
- enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
- mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
+ // We keep a single DeadExpression that represents currently-dead
+ // expressions. Dead expressions we can prove will stay dead are marked
+ // with DFS number zero. However, for phi nodes it is possible that we
+ // assume/prove all arguments are dead during fixpointing; the
+ // DeadExpression is used for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
// Which values have changed as a result of leader changes.
SmallPtrSet<Value *, 8> LeaderChanges;
@@ -521,7 +588,8 @@ private:
const Expression *createBinaryExpression(unsigned, Type *, Value *,
Value *) const;
PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
- bool &AllConstant) const;
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
const VariableExpression *createVariableExpression(Value *) const;
const ConstantExpression *createConstantExpression(Constant *) const;
const Expression *createVariableOrConstant(Value *V) const;
@@ -562,6 +630,9 @@ private:
return CClass;
}
void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePhiOfOps(Instruction *, bool,
+ SmallPtrSetImpl<Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
// Value number an Instruction or MemoryPhi.
void valueNumberMemoryPhi(MemoryPhi *);
@@ -570,7 +641,8 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
Value *) const;
- const Expression *performSymbolicEvaluation(Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
Instruction *,
MemoryAccess *) const;
@@ -595,7 +667,7 @@ private:
bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
- bool isMemoryAccessTop(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
// Ranking
unsigned int getRank(const Value *) const;
@@ -619,19 +691,26 @@ private:
void replaceInstruction(Instruction *, Value *);
void markInstructionForDeletion(Instruction *);
void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const;
// New instruction creation.
void handleNewInstruction(Instruction *){};
// Various instruction touch utilities
+ template <typename Map, typename KeyType, typename Func>
+ void for_each_found(Map &, const KeyType &, Func);
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
void markUsersTouched(Value *);
void markMemoryUsersTouched(const MemoryAccess *);
void markMemoryDefTouched(const MemoryAccess *);
void markPredicateUsersTouched(Instruction *);
void markValueLeaderChangeTouched(CongruenceClass *CC);
void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const HashedExpression &HE);
void addPredicateUsers(const PredicateBase *, Instruction *) const;
void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
// Main loop of value numbering
void iterateTouchedInstructions();
@@ -639,13 +718,18 @@ private:
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
- void updateProcessedCount(Value *V);
+ void updateProcessedCount(const Value *V);
void verifyMemoryCongruency() const;
void verifyIterationSettled(Function &F);
void verifyStoreExpressions() const;
- bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -665,8 +749,8 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
- bool isCycleFree(const PHINode *PN) const;
- template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
std::pair<int, int> StartingVNCounter;
@@ -694,20 +778,46 @@ bool StoreExpression::equals(const Expression &Other) const {
return true;
}
+// Determine if the edge From->To is a backedge
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ if (From == To)
+ return true;
+ auto *FromDTN = DT->getNode(From);
+ auto *ToDTN = DT->getNode(To);
+ return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN);
+}
+
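The backedge test above needs no loop info, only a reverse-post-order numbering of the CFG (here, of dominator tree nodes). A minimal standalone sketch of the same check, using plain integer block ids instead of LLVM's DomTreeNode/RPOOrdering (illustrative only, not part of this patch):

#include <unordered_map>

// Edge From->To is treated as a backedge when To does not appear strictly
// later than From in reverse post order; self edges count as backedges too.
bool isBackedgeRPO(const std::unordered_map<int, unsigned> &RPONumber,
                   int From, int To) {
  if (From == To)
    return true;
  return RPONumber.at(From) >= RPONumber.at(To);
}
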
#ifndef NDEBUG
static std::string getBlockName(const BasicBlock *B) {
return DOTGraphTraits<const Function *>::getSimpleNodeLabel(B, nullptr);
}
#endif
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
+
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
// Get the basic block from an instruction/memory value.
BasicBlock *NewGVN::getBlockForValue(Value *V) const {
- if (auto *I = dyn_cast<Instruction>(V))
- return I->getParent();
- else if (auto *MP = dyn_cast<MemoryPhi>(V))
- return MP->getBlock();
- llvm_unreachable("Should have been able to figure out a block for our value");
- return nullptr;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
}
// Delete a definitely dead expression, so it can be reused by the expression
@@ -719,10 +829,9 @@ void NewGVN::deleteExpression(const Expression *E) const {
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
ExpressionAllocator.Deallocate(E);
}
-
PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
- bool &AllConstant) const {
- BasicBlock *PHIBlock = I->getParent();
+ bool &OriginalOpsConstant) const {
+ BasicBlock *PHIBlock = getBlockForValue(I);
auto *PN = cast<PHINode>(I);
auto *E =
new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
@@ -731,8 +840,6 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
E->setType(I->getType());
E->setOpcode(I->getOpcode());
- unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
-
// NewGVN assumes the operands of a PHI node are in a consistent order across
// PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
// this in LLVM at some point we don't want GVN to find wrong congruences.
@@ -753,18 +860,20 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
});
-
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
[&](const Use *U) -> Value * {
auto *BB = PN->getIncomingBlock(*U);
- auto *DTN = DT->getNode(BB);
- if (RPOOrdering.lookup(DTN) >= PHIRPO)
- HasBackedge = true;
- AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);
-
- // Don't try to transform self-defined phis.
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ OriginalOpsConstant =
+ OriginalOpsConstant && isa<Constant>(*U);
+ // Use nullptr to distinguish between things that were
+ // originally self-defined and those that have an operand
+ // leader that is self-defined.
if (*U == PN)
- return PN;
+ return nullptr;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(*U) == TOPClass)
+ return nullptr;
return lookupOperandLeader(*U);
});
return E;
@@ -785,7 +894,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
// whether all members are constant.
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
auto Operand = lookupOperandLeader(O);
- AllConstant &= isa<Constant>(Operand);
+ AllConstant = AllConstant && isa<Constant>(Operand);
return Operand;
});
@@ -848,7 +957,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
if (CC && CC->getDefiningExpr()) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *V << "\n");
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
deleteExpression(E);
return CC->getDefiningExpr();
@@ -961,6 +1070,12 @@ NewGVN::createAggregateValueExpression(Instruction *I) const {
llvm_unreachable("Unhandled type of aggregate value operation");
}
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpressions are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
@@ -1032,7 +1147,7 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
if (CC) {
- // Everything in TOP is represneted by undef, as it can be any value.
+ // Everything in TOP is represented by undef, as it can be any value.
// We do have to make sure we get the type right though, so we can't set the
// RepLeader to undef.
if (CC == TOPClass)
@@ -1054,7 +1169,7 @@ const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
// Return true if the MemoryAccess is really equivalent to everything. This is
// equivalent to the lattice value "TOP" in most lattices. This is the initial
// state of all MemoryAccesses.
-bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
return getMemoryClass(MA) == TOPClass;
}
@@ -1100,7 +1215,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- auto *StoreAccess = MSSA->getMemoryAccess(SI);
+ auto *StoreAccess = getMemoryAccess(SI);
// Get the expression, if any, for the RHS of the MemoryDef.
const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
if (EnableStoreRefinement)
@@ -1108,7 +1223,6 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// If we bypassed the use-def chains, make sure we add a use.
if (StoreRHS != StoreAccess->getDefiningAccess())
addMemoryUsers(StoreRHS, StoreAccess);
-
StoreRHS = lookupMemoryLeader(StoreRHS);
// If we are defined by ourselves, use the live on entry def.
if (StoreRHS == StoreAccess)
@@ -1137,9 +1251,9 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) {
if ((lookupOperandLeader(LI->getPointerOperand()) ==
lookupOperandLeader(SI->getPointerOperand())) &&
- (lookupMemoryLeader(MSSA->getMemoryAccess(LI)->getDefiningAccess()) ==
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
StoreRHS))
- return createVariableExpression(LI);
+ return createStoreExpression(SI, StoreRHS);
}
}
@@ -1241,8 +1355,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
// Load of undef is undef.
if (isa<UndefValue>(LoadAddressLeader))
return createConstantExpression(UndefValue::get(LI->getType()));
-
- MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(I);
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
@@ -1331,6 +1446,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
// operands are equal, because assumes must always be true.
if (CmpInst::isTrueWhenEqual(Predicate)) {
addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
return createVariableOrConstant(FirstOp);
}
}
@@ -1343,6 +1459,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
(!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
return createVariableOrConstant(FirstOp);
}
// Handle the special case of floating point.
@@ -1350,6 +1467,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
(!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
return createConstantExpression(cast<Constant>(FirstOp));
}
}
@@ -1430,34 +1548,33 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
return Changed;
}
-// Determine if a phi is cycle-free. That means the values in the phi don't
-// depend on any expressions that can change value as a result of the phi.
-// For example, a non-cycle free phi would be v = phi(0, v+1).
-bool NewGVN::isCycleFree(const PHINode *PN) const {
- // In order to compute cycle-freeness, we do SCC finding on the phi, and see
- // what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
- // If it is not in a singleton, it is only cycle free if the other members are
- // all phi nodes (as they do not compute anything, they are copies). TODO:
- // There are likely a few other intrinsics or expressions that could be
- // included here, but this happens so infrequently already that it is not
- // likely to be worth it.
- auto PCS = PhiCycleState.lookup(PN);
- if (PCS == PCS_Unknown) {
- SCCFinder.Start(PN);
- auto &SCC = SCCFinder.getComponentFor(PN);
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
// It's cycle free if its size is 1 or the SCC is *only* phi nodes.
if (SCC.size() == 1)
- PhiCycleState.insert({PN, PCS_CycleFree});
+ InstCycleState.insert({I, ICS_CycleFree});
else {
bool AllPhis =
llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
- PCS = AllPhis ? PCS_CycleFree : PCS_Cycle;
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
for (auto *Member : SCC)
if (auto *MemberPhi = dyn_cast<PHINode>(Member))
- PhiCycleState.insert({MemberPhi, PCS});
+ InstCycleState.insert({MemberPhi, ICS});
}
}
- if (PCS == PCS_Cycle)
+ if (ICS == ICS_Cycle)
return false;
return true;
}
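
The decision isCycleFree makes once the SCC is known can be stated on its own. A standalone sketch, assuming the SCC members have already been classified (the MemberKind enum is hypothetical, not part of this patch):

#include <vector>

enum class MemberKind { Phi, Other };

// Mirrors the rule above: a singleton SCC is cycle-free; a larger SCC is
// cycle-free only if every member is a phi node, since phis are pure copies
// and compute nothing new.
bool sccIsCycleFree(const std::vector<MemberKind> &SCC) {
  if (SCC.size() == 1)
    return true;
  for (MemberKind K : SCC)
    if (K != MemberKind::Phi)
      return false;
  return true;
}
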
@@ -1467,10 +1584,9 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
- // were constant. This is really shorthand for "this phi cannot cycle due
- // to forward propagation", as any change in value of the phi is guaranteed
- // not to later change the value of the phi.
- // IE it can't be v = phi(undef, v+1)
+ // This is really shorthand for "this phi cannot cycle due to forward
+ // propagation", as any change in value of the phi is guaranteed not to later
+ // change the value of the phi. IE it can't be v = phi(undef, v+1)
bool AllConstant = true;
auto *E =
cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
@@ -1478,8 +1594,16 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// See if all arguments are the same.
// We track if any were undef because they need special handling.
bool HasUndef = false;
- auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
- if (Arg == I)
+ bool CycleFree = isCycleFree(I);
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (Arg == nullptr)
+ return false;
+ // Original self-operands are already eliminated during expression creation.
+ // We can only eliminate value-wise self-operands if it's cycle
+ // free. Otherwise, eliminating the operand can cause our value to change,
+ // which can cause us to not eliminate the operand, which changes the value
+ // back to what it was before, cycling forever.
+ if (CycleFree && Arg == I)
return false;
if (isa<UndefValue>(Arg)) {
HasUndef = true;
@@ -1487,20 +1611,19 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
}
return true;
});
- // If we are left with no operands, it's undef
+ // If we are left with no operands, it's dead.
if (Filtered.begin() == Filtered.end()) {
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
- << "\n");
+ DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
deleteExpression(E);
- return createConstantExpression(UndefValue::get(I->getType()));
+ return createDeadExpression();
}
unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [AllSameValue, &NumOps](const Value *V) {
+ if (llvm::all_of(Filtered, [&](Value *Arg) {
++NumOps;
- return V == AllSameValue;
+ return Arg == AllSameValue;
})) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
// phi nodes with cycles (IE dependent on other phis that are .... dependent
@@ -1519,7 +1642,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// constants, or all operands are ignored but the undef, it also must be
// cycle free.
if (!AllConstant && HasBackedge && NumOps > 0 &&
- !isa<UndefValue>(AllSameValue) && !isCycleFree(cast<PHINode>(I)))
+ !isa<UndefValue>(AllSameValue) && !CycleFree)
return E;
// Only have to check for instructions
@@ -1690,8 +1813,18 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
return createExpression(I);
}
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+// TODO: Separate cost from availability
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -1769,12 +1902,39 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
return E;
}
+// Look up a container in a map, and then call a function for each thing in the
+// found container.
+template <typename Map, typename KeyType, typename Func>
+void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end())
+ for (typename Map::mapped_type::value_type Mapped : Result->second)
+ F(Mapped);
+}
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the entry from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
+
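Both templates above are the same "look up a container by key, then act on each element" pattern that previously appeared inline in markMemoryUsersTouched and markPredicateUsersTouched. A standalone sketch of that pattern with standard containers (names and types are illustrative, not part of this patch):

#include <functional>
#include <map>
#include <set>
#include <string>

using UserMap = std::map<std::string, std::set<int>>;

// for_each_found: run F over every element of the mapped container, if the
// key is present at all.
void forEachFound(const UserMap &M, const std::string &Key,
                  const std::function<void(int)> &F) {
  auto It = M.find(Key);
  if (It != M.end())
    for (int User : It->second)
      F(User);
}

// touchAndErase: mark every element of the mapped container as touched, then
// drop the whole entry so it is not processed again.
void touchAndErase(UserMap &M, const std::string &Key, std::set<int> &Touched) {
  auto It = M.find(Key);
  if (It != M.end()) {
    Touched.insert(It->second.begin(), It->second.end());
    M.erase(It);
  }
}
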
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ AdditionalUsers[To].insert(User);
+}
+
void NewGVN::markUsersTouched(Value *V) {
// Now mark the users as touched.
for (auto *User : V->users()) {
assert(isa<Instruction>(User) && "Use of value not within an instruction?");
TouchedInstructions.set(InstrToDFSNum(User));
}
+ touchAndErase(AdditionalUsers, V);
}
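
addAdditionalUsers and the extra touchAndErase call in markUsersTouched exist because symbolic evaluation can now depend on values it does not literally use in the IR (predicate info operands, phi-translated operands), and those dependents must be revisited when the value they depend on changes. A standalone sketch of that bookkeeping (hypothetical names, not part of this patch):

#include <map>
#include <set>
#include <string>

// Dependencies that IR use-def edges do not capture are recorded on the
// side and re-queued together with the ordinary users when a value changes.
struct TouchTracker {
  std::map<std::string, std::set<std::string>> AdditionalUsers;
  std::set<std::string> Touched;

  void addAdditionalUser(const std::string &To, const std::string &User) {
    AdditionalUsers[To].insert(User);
  }

  void markUsersTouched(const std::string &V,
                        const std::set<std::string> &IRUsers) {
    Touched.insert(IRUsers.begin(), IRUsers.end());
    auto It = AdditionalUsers.find(V);
    if (It != AdditionalUsers.end()) {
      Touched.insert(It->second.begin(), It->second.end());
      AdditionalUsers.erase(It);
    }
  }
};
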
void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
@@ -1791,16 +1951,15 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
return;
for (auto U : MA->users())
TouchedInstructions.set(MemoryToDFSNum(U));
- const auto Result = MemoryToUsers.find(MA);
- if (Result != MemoryToUsers.end()) {
- for (auto *User : Result->second)
- TouchedInstructions.set(MemoryToDFSNum(User));
- MemoryToUsers.erase(Result);
- }
+ touchAndErase(MemoryToUsers, MA);
}
// Add I to the set of users of a given predicate.
void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
PredicateToUsers[PBranch->Condition].insert(I);
else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
@@ -1809,12 +1968,7 @@ void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
// Touch all the predicates that depend on this instruction.
void NewGVN::markPredicateUsersTouched(Instruction *I) {
- const auto Result = PredicateToUsers.find(I);
- if (Result != PredicateToUsers.end()) {
- for (auto *User : Result->second)
- TouchedInstructions.set(InstrToDFSNum(User));
- PredicateToUsers.erase(Result);
- }
+ touchAndErase(PredicateToUsers, I);
}
// Mark users affected by a memory leader change.
@@ -1856,11 +2010,11 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
- return MSSA->getMemoryAccess(NL);
+ return getMemoryAccess(NL);
// Find the store with the minimum DFS number.
auto *V = getMinDFSOfRange<Value>(make_filter_range(
*CC, [&](const Value *V) { return isa<StoreInst>(V); }));
- return MSSA->getMemoryAccess(cast<StoreInst>(V));
+ return getMemoryAccess(cast<StoreInst>(V));
}
assert(CC->getStoreCount() == 0);
@@ -1945,31 +2099,11 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (I == OldClass->getNextLeader().first)
OldClass->resetNextLeader();
- // It's possible, though unlikely, for us to discover equivalences such
- // that the current leader does not dominate the old one.
- // This statistic tracks how often this happens.
- // We assert on phi nodes when this happens, currently, for debugging, because
- // we want to make sure we name phi node cycles properly.
- if (isa<Instruction>(NewClass->getLeader()) && NewClass->getLeader() &&
- I != NewClass->getLeader()) {
- auto *IBB = I->getParent();
- auto *NCBB = cast<Instruction>(NewClass->getLeader())->getParent();
- bool Dominated =
- IBB == NCBB && InstrToDFSNum(I) < InstrToDFSNum(NewClass->getLeader());
- Dominated = Dominated || DT->properlyDominates(IBB, NCBB);
- if (Dominated) {
- ++NumGVNNotMostDominatingLeader;
- assert(
- !isa<PHINode>(I) &&
- "New class for instruction should not be dominated by instruction");
- }
- }
+ OldClass->erase(I);
+ NewClass->insert(I);
if (NewClass->getLeader() != I)
NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
-
- OldClass->erase(I);
- NewClass->insert(I);
// Handle our special casing of stores.
if (auto *SI = dyn_cast<StoreInst>(I)) {
OldClass->decStoreCount();
@@ -1984,7 +2118,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// If it's a store expression we are using, it means we are not equivalent
// to something earlier.
if (auto *SE = dyn_cast<StoreExpression>(E)) {
- assert(SE->getStoredValue() != NewClass->getLeader());
NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
@@ -2003,7 +2136,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// instructions before.
// If it's not a memory use, set the MemoryAccess equivalence
- auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
if (InstMA)
moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
@@ -2035,21 +2168,31 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
}
}
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const HashedExpression &HE) {
+ touchAndErase(ExpressionToPhiOfOps, HE);
+}
+
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// This is guaranteed to return something, since it will at least find
// TOP.
- CongruenceClass *IClass = ValueToClass[I];
+ CongruenceClass *IClass = ValueToClass.lookup(I);
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
assert(!IClass->isDead() && "Found a dead class");
- CongruenceClass *EClass;
+ CongruenceClass *EClass = nullptr;
+ HashedExpression HE(E);
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
- EClass = ValueToClass[VE->getVariableValue()];
- } else {
- auto lookupResult = ExpressionToClass.insert({E, nullptr});
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
+ auto lookupResult = ExpressionToClass.insert_as({E, nullptr}, HE);
// If it's not in the value table, create a new congruence class.
if (lookupResult.second) {
@@ -2098,10 +2241,13 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
if (ClassChanged || LeaderChanged) {
DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
<< "\n");
- if (ClassChanged)
+ if (ClassChanged) {
moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(HE);
+ }
+
markUsersTouched(I);
- if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
+ if (MemoryAccess *MA = getMemoryAccess(I))
markMemoryUsersTouched(MA);
if (auto *CI = dyn_cast<CmpInst>(I))
markPredicateUsersTouched(CI);
@@ -2110,11 +2256,11 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// old store expression. In particular, loads do not compare against stored
// value, so they will find old store expressions (and associated class
// mappings) if we leave them in the table.
- if (ClassChanged && isa<StoreExpression>(E)) {
+ if (ClassChanged && isa<StoreInst>(I)) {
auto *OldE = ValueToExpression.lookup(I);
// It could just be that the old class died. We don't want to erase it if we
// just moved classes.
- if (OldE && isa<StoreExpression>(OldE) && !OldE->equals(*E))
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE)
ExpressionToClass.erase(OldE);
}
ValueToExpression[I] = E;
@@ -2139,7 +2285,7 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// impact predicates. Otherwise, only mark the phi nodes as touched, as
// they are the only thing that depend on new edges. Anything using their
// values will get propagated to if necessary.
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
TouchedInstructions.set(InstrToDFSNum(MemPhi));
auto BI = To->begin();
@@ -2147,6 +2293,9 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
TouchedInstructions.set(InstrToDFSNum(&*BI));
++BI;
}
+ for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
+ TouchedInstructions.set(InstrToDFSNum(I));
+ });
}
}
}
@@ -2236,7 +2385,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
// This also may be a memory defining terminator, in which case, set it
// equivalent only to itself.
//
- auto *MA = MSSA->getMemoryAccess(TI);
+ auto *MA = getMemoryAccess(TI);
if (MA && !isa<MemoryUse>(MA)) {
auto *CC = ensureLeaderOfMemoryClass(MA);
if (setMemoryClass(MA, CC))
@@ -2245,6 +2394,158 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
}
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ PHIOfOpsPHIs[BB].push_back(Op);
+ TempToBlock[Op] = BB;
+ if (ExistingValue)
+ RealToTemp[ExistingValue] = Op;
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
+const Expression *
+NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+ // of ops; we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ unsigned IDFSNum = InstrToDFSNum(I);
+ // Pretty much all of the add instructions we can convert to phi of ops over
+ // a backedge are really induction variables, and those are pretty much
+ // pointless to convert. This is a very coarse-grained test, so if we do
+ // find some value, we can change it later.
+ // But otherwise, what can happen is we convert the induction variable from
+ //
+ // i = phi (0, tmp)
+ // tmp = i + 1
+ //
+ // to
+ // i = phi (0, tmpphi)
+ // tmpphi = phi(1, tmpphi+1)
+ //
+ // Which we don't want to happen. We could just avoid this for all non-cycle
+ // free phis, and we may go that route.
+ if (HasBackedge && I->getOpcode() == Instruction::Add)
+ return nullptr;
+
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ for (auto &Op : I->operands()) {
+ if (!isa<PHINode>(Op))
+ continue;
+ auto *OpPHI = cast<PHINode>(Op);
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1)
+ continue;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ for (auto PredBB : OpPHI->blocks()) {
+ Value *FoundVal = nullptr;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it, and see if we
+ // have a leader.
+ Instruction *ValueOp = I->clone();
+ auto Iter = TempToMemory.end();
+ if (MemAccess)
+ Iter = TempToMemory.insert({ValueOp, MemAccess}).first;
+
+ for (auto &Op : ValueOp->operands()) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ // When this operand changes, it could change whether there is a
+ // leader for us or not.
+ addAdditionalUsers(Op, I);
+ }
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(ValueOp);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ InstrDFS.insert({ValueOp, IDFSNum});
+ const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
+ InstrDFS.erase(ValueOp);
+ AllTempInstructions.erase(ValueOp);
+ ValueOp->deleteValue();
+ if (MemAccess)
+ TempToMemory.erase(Iter);
+ if (!E)
+ return nullptr;
+ FoundVal = findPhiOfOpsLeader(E, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(I);
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ } else {
+ DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ }
+
+ Ops.push_back({FoundVal, PredBB});
+ DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : Ops)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ unsigned int i = 0;
+ for (auto PHIOp : Ops) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+
+ DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+ return performSymbolicEvaluation(ValuePHI, Visited);
+ }
+ return nullptr;
+}
+
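The core of makePossiblePhiOfOps is easier to see stripped of the leader lookups and temporary-instruction bookkeeping: evaluate the operation once per predecessor with that edge's incoming value, then build a phi of the results. A heavily simplified standalone sketch over plain integers (illustrative only, not part of this patch):

#include <functional>
#include <string>
#include <vector>

// One incoming (value, predecessor block) pair of a phi node.
struct Incoming {
  int Value;
  std::string Pred;
};

// Given "op(phi)" where Op is a unary operation over constants, build the
// equivalent "phi of ops" by evaluating Op once per incoming edge. The real
// pass clones the instruction, phi-translates its operands per predecessor,
// and asks value numbering for a leader instead of evaluating directly.
std::vector<Incoming> phiOfOps(const std::vector<Incoming> &Phi,
                               const std::function<int(int)> &Op) {
  std::vector<Incoming> Result;
  for (const Incoming &In : Phi)
    Result.push_back({Op(In.Value), In.Pred});
  return Result;
}

// Usage: for x = phi(0 from A, 4 from B) and y = x + 1, this produces
// y' = phi(1 from A, 5 from B), which y can then be congruent to.
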
// The algorithm initially places the values of the routine in the TOP
// congruence class. The leader of TOP is the undetermined value `undef`.
// When the algorithm has finished, values still in TOP are unreachable.
@@ -2287,6 +2588,12 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
TOPClass->incStoreCount();
}
for (auto &I : *BB) {
+ // TODO: Move to helper
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
// Don't insert void terminators into the class. We don't value number
// them, and they just end up sitting in TOP.
if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
@@ -2311,12 +2618,35 @@ void NewGVN::cleanupTables() {
CongruenceClasses[i] = nullptr;
}
+ // Destroy the temporary (value PHI) instructions we created
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
+ auto *I = TempInst.back();
+ TempInst.pop_back();
+ I->deleteValue();
+ }
+
ValueToClass.clear();
ArgRecycler.clear(ExpressionAllocator);
ExpressionAllocator.Reset();
CongruenceClasses.clear();
ExpressionToClass.clear();
ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHIOfOpsPHIs.clear();
ReachableBlocks.clear();
ReachableEdges.clear();
#ifndef NDEBUG
@@ -2332,14 +2662,17 @@ void NewGVN::cleanupTables() {
MemoryToUsers.clear();
}
+// Assign local DFS numbers to instructions, and leave space for value
+// PHIs.
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
unsigned Start) {
unsigned End = Start;
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(B)) {
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
InstrDFS[MemPhi] = End++;
DFSToInstr.emplace_back(MemPhi);
}
+ // Then the real block goes next.
for (auto &I : *B) {
// There's no need to call isInstructionTriviallyDead more than once on
// an instruction. Therefore, once we know that an instruction is dead
@@ -2350,7 +2683,6 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
markInstructionForDeletion(&I);
continue;
}
-
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -2361,7 +2693,7 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
return std::make_pair(Start, End);
}
-void NewGVN::updateProcessedCount(Value *V) {
+void NewGVN::updateProcessedCount(const Value *V) {
#ifndef NDEBUG
if (ProcessedCount.count(V) == 0) {
ProcessedCount.insert({V, 1});
@@ -2375,12 +2707,13 @@ void NewGVN::updateProcessedCount(Value *V) {
// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
- // argument.
- // Filter out unreachable blocks and self phis from our operands.
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing
+ // when doing the self-phi checking.
const BasicBlock *PHIBlock = MP->getBlock();
auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return lookupMemoryLeader(cast<MemoryAccess>(U)) != MP &&
- !isMemoryAccessTop(cast<MemoryAccess>(U)) &&
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
});
// If all that is left is nothing, our memoryphi is undef. We keep it as
@@ -2433,18 +2766,26 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
DEBUG(dbgs() << "Processing instruction " << *I << "\n");
if (!I->isTerminator()) {
const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
if (DebugCounter::shouldExecute(VNCounter)) {
- Symbolized = performSymbolicEvaluation(I);
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ // FIXME: Backedge argument
+ auto *PHIE = makePossiblePhiOfOps(I, false, Visited);
+ if (PHIE)
+ Symbolized = PHIE;
+ }
+
} else {
// Mark the instruction as unused so we don't value number it again.
InstrDFS[I] = 0;
}
// If we couldn't come up with a symbolic expression, use the unknown
// expression
- if (Symbolized == nullptr) {
+ if (Symbolized == nullptr)
Symbolized = createUnknownExpression(I);
- }
-
performCongruenceFinding(I, Symbolized);
} else {
// Handle terminators that return values. All of them produce values we
@@ -2460,13 +2801,23 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
// Check if there is a path, using single or equal argument phi nodes, from
// First to Second.
-bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
- const MemoryAccess *Second) const {
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
if (First == Second)
return true;
if (MSSA->isLiveOnEntryDef(First))
return false;
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be to do strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccesses and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
const auto *EndDef = First;
for (auto *ChainDef : optimized_def_chain(First)) {
if (ChainDef == Second)
@@ -2489,7 +2840,8 @@ bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
Okay =
std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
if (Okay)
- return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
return false;
}
@@ -2552,17 +2904,17 @@ void NewGVN::verifyMemoryCongruency() const {
auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
for (auto KV : Filtered) {
- assert(KV.second != TOPClass &&
- "Memory not unreachable but ended up in TOP");
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
- if (FirstMUD && SecondMUD)
- assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
"The instructions for these memory operations should have "
"been in the same congruence class or reachable through"
"a single argument phi");
+ }
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
// We can only sanely verify that MemoryDefs in the operand list all have
// the same class.
@@ -2671,7 +3023,7 @@ void NewGVN::iterateTouchedInstructions() {
// Nothing set, nothing to iterate, just return.
if (FirstInstr == -1)
return;
- BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
while (TouchedInstructions.any()) {
++Iterations;
// Walk through all the instructions in all the blocks in RPO.
@@ -2688,7 +3040,7 @@ void NewGVN::iterateTouchedInstructions() {
}
Value *V = InstrFromDFSNum(InstrNum);
- BasicBlock *CurrBlock = getBlockForValue(V);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
// If we hit a new block, do reachability processing.
if (CurrBlock != LastBlock) {
@@ -2731,6 +3083,7 @@ bool NewGVN::runGVN() {
bool Changed = false;
NumFuncArgs = F.arg_size();
MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
// Count number of instructions for sizing of hash tables, and come
// up with a global dfs numbering for instructions.
@@ -2769,6 +3122,7 @@ bool NewGVN::runGVN() {
BlockInstRange.insert({B, BlockRange});
ICount += BlockRange.second - BlockRange.first;
}
+ initializeCongruenceClasses(F);
TouchedInstructions.resize(ICount);
// Ensure we don't end up resizing the expressionToClass map, as
@@ -2779,9 +3133,10 @@ bool NewGVN::runGVN() {
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
+ DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
- initializeCongruenceClasses(F);
iterateTouchedInstructions();
verifyMemoryCongruency();
verifyIterationSettled(F);
@@ -2794,7 +3149,8 @@ bool NewGVN::runGVN() {
if (!ToErase->use_empty())
ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
- ToErase->eraseFromParent();
+ if (ToErase->getParent())
+ ToErase->eraseFromParent();
}
// Delete all unreachable blocks.
@@ -2813,14 +3169,6 @@ bool NewGVN::runGVN() {
return Changed;
}
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-// TODO: Separate cost from availability
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
@@ -2910,9 +3258,21 @@ void NewGVN::convertClassToDFSOrdered(
}
assert(isa<Instruction>(D) &&
"The dense set member should always be an instruction");
- VDDef.LocalNum = InstrToDFSNum(D);
- DFSOrderedSet.emplace_back(VDDef);
Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
+
unsigned int UseCount = 0;
// Now add the uses.
for (auto &U : Def->uses()) {
@@ -2929,7 +3289,7 @@ void NewGVN::convertClassToDFSOrdered(
// they are from.
VDUse.LocalNum = InstrDFS.size() + 1;
} else {
- IBlock = I->getParent();
+ IBlock = getBlockForValue(I);
VDUse.LocalNum = InstrToDFSNum(I);
}
@@ -3099,6 +3459,37 @@ private:
};
}
+// Given an expression and a basic block we are trying to make it available
+// in, see if the expression has a leader available in that block.
+Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return VE->getVariableValue();
+
+ auto *CC = ExpressionToClass.lookup(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ // If we are looking for something in the same block as the member, it must
+ // be a leader because this function is looking for operands for a phi node.
+ if (MemberInst->getParent() == BB ||
+ DT->dominates(MemberInst->getParent(), BB)) {
+ return Member;
+ }
+ }
+ return nullptr;
+}
+
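The availability rule findPhiOfOpsLeader applies can be summarized apart from the expression machinery: anything that is not an instruction is usable anywhere, and an instruction is usable for an edge from BB if it lives in BB or in a dominator of BB. A standalone sketch under those assumptions (hypothetical types, not part of this patch):

#include <functional>
#include <optional>
#include <string>
#include <vector>

struct Member {
  std::string Name;
  bool IsInstruction;
  std::string Block; // only meaningful when IsInstruction is true
};

// Pick a class member usable as a phi operand coming from block BB.
// Non-instructions (constants, arguments) are always usable; instructions
// must sit in BB itself or in a block that dominates it.
std::optional<std::string>
pickLeaderFor(const std::vector<Member> &Class, const std::string &BB,
              const std::function<bool(const std::string &,
                                       const std::string &)> &Dominates) {
  for (const Member &M : Class) {
    if (!M.IsInstruction)
      return M.Name;
    if (M.Block == BB || Dominates(M.Block, BB))
      return M.Name;
  }
  return std::nullopt;
}
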
bool NewGVN::eliminateInstructions(Function &F) {
// This is a non-standard eliminator. The normal way to eliminate is
// to walk the dominator tree in order, keeping track of available
@@ -3129,25 +3520,42 @@ bool NewGVN::eliminateInstructions(Function &F) {
// DFS numbers are updated, we compute some ourselves.
DT->updateDFSNumbers();
- for (auto &B : F) {
- if (!ReachableBlocks.count(&B)) {
- for (const auto S : successors(&B)) {
- for (auto II = S->begin(); isa<PHINode>(II); ++II) {
- auto &Phi = cast<PHINode>(*II);
- DEBUG(dbgs() << "Replacing incoming value of " << *II << " for block "
- << getBlockName(&B)
- << " with undef due to it being unreachable\n");
- for (auto &Operand : Phi.incoming_values())
- if (Phi.getIncomingBlock(Operand) == &B)
- Operand.set(UndefValue::get(Phi.getType()));
- }
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI.incoming_values())
+ if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) {
+ DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
+ << getBlockName(PHI.getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI.getType()));
}
+ };
+ SmallPtrSet<BasicBlock *, 8> BlocksWithPhis;
+ for (auto &B : F)
+ if ((!B.empty() && isa<PHINode>(*B.begin())) ||
+ (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end()))
+ BlocksWithPhis.insert(&B);
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto *BB : BlocksWithPhis)
+ // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
+ // the block and subtract the pred count, but it's more complicated.
+ if (ReachablePredCount.lookup(BB) !=
+ std::distance(pred_begin(BB), pred_end(BB))) {
+ for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
+ auto &PHI = cast<PHINode>(*II);
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) {
+ ReplaceUnreachablePHIArgs(*PHI, BB);
+ });
}
- }
// Map to store the use counts
DenseMap<const Value *, unsigned int> UseCounts;
- for (CongruenceClass *CC : reverse(CongruenceClasses)) {
+ for (auto *CC : reverse(CongruenceClasses)) {
// Track the equivalent store info so we can decide whether to try
// dead store elimination.
SmallVector<ValueDFS, 8> PossibleDeadStores;
@@ -3156,13 +3564,15 @@ bool NewGVN::eliminateInstructions(Function &F) {
continue;
// Everything still in the TOP class is unreachable or dead.
if (CC == TOPClass) {
-#ifndef NDEBUG
- for (auto M : *CC)
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
InstructionsToErase.count(cast<Instruction>(M))) &&
"Everything in TOP should be unreachable or dead at this "
"point");
-#endif
+ }
continue;
}
@@ -3195,7 +3605,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
<< "\n");
// If this is a singleton, we can skip it.
- if (CC->size() != 1) {
+ if (CC->size() != 1 || RealToTemp.lookup(Leader)) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -3217,6 +3627,22 @@ bool NewGVN::eliminateInstructions(Function &F) {
// We ignore void things because we can't get a value from them.
if (Def && Def->getType()->isVoidTy())
continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+ // If this is a value phi and that's the expression we used, insert
+ // it into the program and remove it from the temporary instruction
+ // list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ DEBUG(dbgs() << "Inserting fully real phi of ops " << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
+ }
if (EliminationStack.empty()) {
DEBUG(dbgs() << "Elimination Stack is empty\n");
@@ -3301,6 +3727,10 @@ bool NewGVN::eliminateInstructions(Function &F) {
Value *DominatingLeader = EliminationStack.back();
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ DominatingLeader = II->getOperand(0);
+
// Don't replace our existing users with ourselves.
if (U->get() == DominatingLeader)
continue;
@@ -3321,6 +3751,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
// It's about to be alive again.
if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ if (LeaderUseCount == 0 && II)
+ ProbablyDead.insert(II);
++LeaderUseCount;
AnythingReplaced = true;
}
@@ -3375,7 +3807,6 @@ bool NewGVN::eliminateInstructions(Function &F) {
}
}
}
-
return AnythingReplaced;
}
@@ -3385,19 +3816,23 @@ bool NewGVN::eliminateInstructions(Function &F) {
// we will simplify an operation with all constants so that it doesn't matter
// what order they appear in.
unsigned int NewGVN::getRank(const Value *V) const {
- // Prefer undef to anything else
+ // Prefer constants to undef, and undef to anything else.
+ // Undef is a constant, so we have to check for it first.
+ // Prefer plain constants to constantexprs.
+ if (isa<ConstantExpr>(V))
+ return 2;
if (isa<UndefValue>(V))
- return 0;
- if (isa<Constant>(V))
return 1;
+ if (isa<Constant>(V))
+ return 0;
else if (auto *A = dyn_cast<Argument>(V))
- return 2 + A->getArgNo();
+ return 3 + A->getArgNo();
// Need to shift the instruction DFS by number of arguments + 4 to account for
// the constant and argument ranking above.
unsigned Result = InstrToDFSNum(V);
if (Result > 0)
- return 3 + NumFuncArgs + Result;
+ return 4 + NumFuncArgs + Result;
// Unreachable or something else, just return a really large number.
return ~0;
}
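
The renumbering above only matters because the ranks feed operand canonicalization for commutative expressions elsewhere in the pass, so congruent expressions are built with the same operand order no matter how the source spelled them. A minimal sketch of that consumer, with the direction of the swap treated as an assumption rather than a statement about NewGVN's exact choice (not part of this patch):

#include <utility>

// Given the ranks of the two operands of a commutative operation, put the
// lower-ranked (more "constant-like") operand first so that a+b and b+a
// produce the same canonical operand order.
template <typename T>
void canonicalizeCommutative(T &LHS, T &RHS, unsigned RankLHS,
                             unsigned RankRHS) {
  if (RankLHS > RankRHS)
    std::swap(LHS, RHS);
}
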
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 53320bff0883..a20890b22603 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1582,7 +1582,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
}
// No need for extra uses anymore.
- delete DummyInst;
+ DummyInst->deleteValue();
unsigned NumAddedValues = NewMulOps.size();
Value *V = EmitAddTreeOfValues(I, NewMulOps);
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 1d9beffaf06b..24bd0a2b7bdf 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -2443,7 +2443,7 @@ private:
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
- delete Placeholder;
+ Placeholder->deleteValue();
} else {
LI.replaceAllUsesWith(V);
}
@@ -3898,8 +3898,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition =
- std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
// Now that we've processed all the slices in the new partition, check if any
// PHIs or Selects would block promotion.
@@ -4016,8 +4015,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca =
- std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 2be3f5c533b9..8b8d6590aa6a 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -693,7 +693,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
UnlinkedInst->setOperand(I, nullptr);
RecursivelyDeleteTriviallyDeadInstructions(Op);
}
- delete UnlinkedInst;
+ UnlinkedInst->deleteValue();
}
bool Ret = !UnlinkedInstructions.empty();
UnlinkedInstructions.clear();
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 4aa26fd14fee..bf2ab7c55be2 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -317,7 +317,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
if (!NewInst->mayHaveSideEffects()) {
VMap[&*II] = V;
- delete NewInst;
+ NewInst->deleteValue();
continue;
}
}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index b44bc74d6551..27f72fcd8bda 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2235,7 +2235,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
if (!BBI->use_empty())
TranslateMap[&*BBI] = V;
if (!N->mayHaveSideEffects()) {
- delete N; // Instruction folded away, don't need actual inst
+ N->deleteValue(); // Instruction folded away, don't need actual inst
N = nullptr;
}
} else {
@@ -4380,7 +4380,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
// Gather dead cases.
SmallVector<ConstantInt *, 8> DeadCases;
for (auto &Case : SI->cases()) {
- APInt CaseVal = Case.getCaseValue()->getValue();
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
(CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
DeadCases.push_back(Case.getCaseValue());
@@ -4946,7 +4946,7 @@ SwitchLookupTable::SwitchLookupTable(
LinearMappingPossible = false;
break;
}
- APInt Val = ConstVal->getValue();
+ const APInt &Val = ConstVal->getValue();
if (I != 0) {
APInt Dist = Val - PrevVal;
if (I == 1) {
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 1de579ed41b0..85c9464b5569 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -426,57 +426,70 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
return Dst;
}
-Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B,
+ unsigned CharSize) {
Value *Src = CI->getArgOperand(0);
// Constant folding: strlen("xyz") -> 3
- if (uint64_t Len = GetStringLength(Src))
+ if (uint64_t Len = GetStringLength(Src, CharSize))
return ConstantInt::get(CI->getType(), Len - 1);
// If s is a constant pointer pointing to a string literal, we can fold
- // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
// [0, strlen(s)] or the string has a single null terminator '\0' at the end.
- // We only try to simplify strlen when the pointer s points to an array
+ // We only try to simplify strlen when the pointer s points to an array
// of i8. Otherwise, we would need to scale the offset x before doing the
- // subtraction. This will make the optimization more complex, and it's not
- // very useful because calling strlen for a pointer of other types is
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
// very uncommon.
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
- if (!isGEPBasedOnPointerToString(GEP))
+ if (!isGEPBasedOnPointerToString(GEP, CharSize))
return nullptr;
- StringRef Str;
- if (getConstantStringInfo(GEP->getOperand(0), Str, 0, false)) {
- size_t NullTermIdx = Str.find('\0');
-
- // If the string does not have '\0', leave it to strlen to compute
- // its length.
- if (NullTermIdx == StringRef::npos)
- return nullptr;
-
+ ConstantDataArraySlice Slice;
+ if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
+ uint64_t NullTermIdx;
+ if (Slice.Array == nullptr) {
+ NullTermIdx = 0;
+ } else {
+ NullTermIdx = ~((uint64_t)0);
+ for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
+ if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
+ NullTermIdx = I;
+ break;
+ }
+ }
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == ~((uint64_t)0))
+ return nullptr;
+ }
+
Value *Offset = GEP->getOperand(2);
unsigned BitWidth = Offset->getType()->getIntegerBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(Offset, Known, DL, 0, nullptr, CI, nullptr);
Known.Zero.flipAllBits();
- size_t ArrSize =
+ uint64_t ArrSize =
cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
- // KnownZero's bits are flipped, so zeros in KnownZero now represent
- // bits known to be zeros in Offset, and ones in KnowZero represent
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+ // bits known to be zeros in Offset, and ones in KnownZero represent
// bits unknown in Offset. Therefore, Offset is known to be in range
- // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
// unsigned-less-than NullTermIdx.
//
- // If Offset is not provably in the range [0, NullTermIdx], we can still
- // optimize if we can prove that the program has undefined behavior when
- // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
// is a pointer to an object whose memory extent is NullTermIdx+1.
- if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
+ if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
(GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
- NullTermIdx == ArrSize - 1))
- return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ NullTermIdx == ArrSize - 1)) {
+ Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
Offset);
+ }
}
return nullptr;
@@ -484,8 +497,8 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
// strlen(x?"foo":"bars") --> x ? 3 : 4
if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
- uint64_t LenTrue = GetStringLength(SI->getTrueValue());
- uint64_t LenFalse = GetStringLength(SI->getFalseValue());
+ uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
+ uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
if (LenTrue && LenFalse) {
Function *Caller = CI->getParent()->getParent();
emitOptimizationRemark(CI->getContext(), "simplify-libcalls", *Caller,
@@ -505,6 +518,17 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+ return optimizeStringLength(CI, B, 8);
+}
+
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
+ Module &M = *CI->getParent()->getParent()->getParent();
+ unsigned WCharSize = TLI->getWCharSize(M) * 8;
+
+ return optimizeStringLength(CI, B, WCharSize);
+}
+
Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
@@ -2026,6 +2050,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
return optimizeMemMove(CI, Builder);
case LibFunc_memset:
return optimizeMemSet(CI, Builder);
+ case LibFunc_wcslen:
+ return optimizeWcslen(CI, Builder);
default:
break;
}
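
For illustration only (not part of the patch): a minimal standalone sketch of the two checks the unified optimizeStringLength routine above relies on — scanning the constant data for the first zero element (now CharSize-aware, so the same path serves strlen and wcslen) and accepting the strlen(s + x) fold only when the flipped known-zero mask bounds the offset by the null-terminator index. Plain integers stand in for APInt and ConstantDataArraySlice, so treat this as an assumed simplification, not the real implementation.

#include <cstdint>

// Stand-in for the null-terminator scan over a ConstantDataArraySlice.
static bool findNullTermIdx(const uint32_t *Elts, uint64_t Len, uint64_t &NullTermIdx) {
  for (uint64_t I = 0; I != Len; ++I)
    if (Elts[I] == 0) { NullTermIdx = I; return true; }
  return false; // no terminator found: leave it to strlen/wcslen at run time
}

// Stand-in for the KnownBits range check: after flipAllBits, KnownZero has a
// one wherever a bit of Offset may be set, so it is an upper bound on Offset;
// the fold strlen(s + x) -> strlen(s) - x is safe when that bound does not
// exceed the null-terminator index.
static bool offsetProvablyInRange(uint64_t FlippedKnownZero, uint64_t NullTermIdx) {
  return FlippedKnownZero <= NullTermIdx;
}

int main() {
  uint32_t Str[] = {'a', 'b', 'c', 0};
  uint64_t NullTermIdx;
  if (findNullTermIdx(Str, 4, NullTermIdx) && offsetProvablyInRange(1, NullTermIdx))
    return 0; // fold would apply: strlen(s + x) == NullTermIdx - x
  return 1;
}
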
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 516ab7d03a88..1dc554bede7e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3047,7 +3047,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
- 0, "wide.masked.gather");
+ nullptr, "wide.masked.gather");
Entry[Part] = NewLI;
} else {
// Calculate the pointer for the specific unroll-part.
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 64013d6d687d..e6f78e6b94a3 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -578,12 +578,12 @@ private:
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
- DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ DeletedInstructions.emplace_back(I);
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
- SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+ SmallVector<unique_value, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User). External User
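
The DeletedInstructions change above switches from std::unique_ptr<Instruction> to LLVM's unique_value (a std::unique_ptr with a Value-aware deleter), so emplace_back(I) takes ownership directly and the instructions are destroyed when the BoUpSLP object is. A hedged, generic sketch of the same ownership pattern with a custom deleter follows; the names are invented for illustration and are not LLVM's.

#include <memory>
#include <vector>

struct Node {};  // stand-in for a heap-allocated IR object

// Custom deleter: destruction is routed through a type-aware hook rather
// than a plain `delete` of the concrete class.
struct NodeDeleter {
  void operator()(Node *N) const { delete N; }
};
using unique_node = std::unique_ptr<Node, NodeDeleter>;

int main() {
  std::vector<unique_node> Deleted;
  Node *N = new Node();
  Deleted.emplace_back(N);  // the vector takes ownership in place
}                           // N is freed here via NodeDeleter
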
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 7bee1bd57373..64e0a82456f1 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -378,8 +378,8 @@ entry:
%cond = icmp eq i32 %a, 42
br i1 %cond, label %exit, label %unr, !prof !4
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> unr probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> unr probability is 0x00000001 / 0x80000000 = 0.00%
unr:
unreachable
@@ -396,8 +396,8 @@ entry:
%cond = icmp eq i32 %a, 42
br i1 %cond, label %exit, label %unr, !prof !5
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> unr probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> unr probability is 0x00000001 / 0x80000000 = 0.00%
unr:
unreachable
@@ -406,7 +406,7 @@ exit:
ret i32 %b
}
-!5 = !{!"branch_weights", i32 1048575, i32 1}
+!5 = !{!"branch_weights", i32 2147483647, i32 1}
define i32 @test_unreachable_with_prof_zero(i32 %a, i32 %b) {
; CHECK: Printing analysis {{.*}} for function 'test_unreachable_with_prof_zero'
@@ -414,8 +414,8 @@ entry:
%cond = icmp eq i32 %a, 42
br i1 %cond, label %exit, label %unr, !prof !6
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> unr probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> unr probability is 0x00000001 / 0x80000000 = 0.00%
unr:
unreachable
@@ -451,11 +451,11 @@ entry:
i32 2, label %case_c
i32 3, label %case_d
i32 4, label %case_e ], !prof !8
-; CHECK: edge entry -> case_a probability is 0x00000800 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_b probability is 0x07fffdff / 0x80000000 = 6.25%
-; CHECK: edge entry -> case_c probability is 0x67fffdff / 0x80000000 = 81.25% [HOT edge]
-; CHECK: edge entry -> case_d probability is 0x07fffdff / 0x80000000 = 6.25%
-; CHECK: edge entry -> case_e probability is 0x07fffdff / 0x80000000 = 6.25%
+; CHECK: edge entry -> case_a probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_b probability is 0x07ffffff / 0x80000000 = 6.25%
+; CHECK: edge entry -> case_c probability is 0x67ffffff / 0x80000000 = 81.25% [HOT edge]
+; CHECK: edge entry -> case_d probability is 0x07ffffff / 0x80000000 = 6.25%
+; CHECK: edge entry -> case_e probability is 0x07ffffff / 0x80000000 = 6.25%
case_a:
unreachable
@@ -493,11 +493,11 @@ entry:
i32 2, label %case_c
i32 3, label %case_d
i32 4, label %case_e ], !prof !9
-; CHECK: edge entry -> case_a probability is 0x00000400 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_c probability is 0x6aaaa7ff / 0x80000000 = 83.33% [HOT edge]
-; CHECK: edge entry -> case_d probability is 0x0aaaa7ff / 0x80000000 = 8.33%
-; CHECK: edge entry -> case_e probability is 0x0aaaa7ff / 0x80000000 = 8.33%
+; CHECK: edge entry -> case_a probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_b probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_c probability is 0x6aaaaaa9 / 0x80000000 = 83.33% [HOT edge]
+; CHECK: edge entry -> case_d probability is 0x0aaaaaa9 / 0x80000000 = 8.33%
+; CHECK: edge entry -> case_e probability is 0x0aaaaaa9 / 0x80000000 = 8.33%
case_a:
unreachable
@@ -534,10 +534,10 @@ entry:
i32 3, label %case_d
i32 4, label %case_e ], !prof !10
; CHECK: edge entry -> case_a probability is 0x00000000 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_c probability is 0x6e08fa2d / 0x80000000 = 85.96% [HOT edge]
-; CHECK: edge entry -> case_d probability is 0x08fb80e9 / 0x80000000 = 7.02%
-; CHECK: edge entry -> case_e probability is 0x08fb80e9 / 0x80000000 = 7.02%
+; CHECK: edge entry -> case_b probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_c probability is 0x6e08fb82 / 0x80000000 = 85.96% [HOT edge]
+; CHECK: edge entry -> case_d probability is 0x08fb823e / 0x80000000 = 7.02%
+; CHECK: edge entry -> case_e probability is 0x08fb823e / 0x80000000 = 7.02%
case_a:
unreachable
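
The updated CHECK lines read as 32-bit fixed-point fractions of 0x80000000, with unreachable edges now getting the minimum numerator 1 instead of 0x800. A quick sanity check of the printed percentages, assuming only that printing convention:

#include <cstdio>

int main() {
  const double D = 0x80000000u;  // denominator used in the probability output
  std::printf("%.2f%%\n", 100.0 * 0x00000001u / D);  // 0.00%
  std::printf("%.2f%%\n", 100.0 * 0x07ffffffu / D);  // 6.25%
  std::printf("%.2f%%\n", 100.0 * 0x67ffffffu / D);  // 81.25%
}
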
diff --git a/test/Analysis/BranchProbabilityInfo/deopt-intrinsic.ll b/test/Analysis/BranchProbabilityInfo/deopt-intrinsic.ll
index faa09f9e8a0c..c2681e5e7c80 100644
--- a/test/Analysis/BranchProbabilityInfo/deopt-intrinsic.ll
+++ b/test/Analysis/BranchProbabilityInfo/deopt-intrinsic.ll
@@ -9,8 +9,8 @@ entry:
%cond = icmp eq i32 %a, 42
br i1 %cond, label %exit, label %deopt
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> deopt probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> deopt probability is 0x00000001 / 0x80000000 = 0.00%
deopt:
%rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
diff --git a/test/Analysis/BranchProbabilityInfo/noreturn.ll b/test/Analysis/BranchProbabilityInfo/noreturn.ll
index 0c2fe863d034..0566ca16c2f3 100644
--- a/test/Analysis/BranchProbabilityInfo/noreturn.ll
+++ b/test/Analysis/BranchProbabilityInfo/noreturn.ll
@@ -9,8 +9,8 @@ define i32 @test1(i32 %a, i32 %b) {
entry:
%cond = icmp eq i32 %a, 42
br i1 %cond, label %exit, label %abort
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> abort probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> abort probability is 0x00000001 / 0x80000000 = 0.00%
abort:
call void @abort() noreturn
@@ -27,11 +27,11 @@ entry:
i32 2, label %case_b
i32 3, label %case_c
i32 4, label %case_d]
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> case_a probability is 0x00000200 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_b probability is 0x00000200 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_c probability is 0x00000200 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_d probability is 0x00000200 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7ffffffc / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> case_a probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_b probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_c probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> case_d probability is 0x00000001 / 0x80000000 = 0.00%
case_a:
br label %case_b
@@ -56,8 +56,8 @@ define i32 @test3(i32 %a, i32 %b) {
entry:
%cond1 = icmp eq i32 %a, 42
br i1 %cond1, label %exit, label %dom
-; CHECK: edge entry -> exit probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
-; CHECK: edge entry -> dom probability is 0x00000800 / 0x80000000 = 0.00%
+; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> dom probability is 0x00000001 / 0x80000000 = 0.00%
dom:
%cond2 = icmp ult i32 %a, 42
@@ -87,8 +87,8 @@ define i32 @throwSmallException(i32 %idx, i32 %limit) #0 personality i8* bitcast
entry:
%cmp = icmp sge i32 %idx, %limit
br i1 %cmp, label %if.then, label %if.end
-; CHECK: edge entry -> if.then probability is 0x00000800 / 0x80000000 = 0.00%
-; CHECK: edge entry -> if.end probability is 0x7ffff800 / 0x80000000 = 100.00% [HOT edge]
+; CHECK: edge entry -> if.then probability is 0x00000001 / 0x80000000 = 0.00%
+; CHECK: edge entry -> if.end probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge]
if.then: ; preds = %entry
%exception = call i8* @__cxa_allocate_exception(i64 1) #0
diff --git a/test/Analysis/CostModel/X86/ctlz.ll b/test/Analysis/CostModel/X86/ctlz.ll
index 2c97da15aee5..769d73915e36 100644
--- a/test/Analysis/CostModel/X86/ctlz.ll
+++ b/test/Analysis/CostModel/X86/ctlz.ll
@@ -1,9 +1,12 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl -mattr=-avx512cd -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -mattr=-avx512cd -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -mattr=+avx512cd -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512CD
; Verify the cost of scalar leading zero count instructions.
@@ -80,11 +83,18 @@ declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)
+declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
+declare <32 x i16> @llvm.ctlz.v32i16(<32 x i16>, i1)
+declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>, i1)
+
define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64':
; SSE2: Found an estimated cost of 25 for instruction: %ctlz
; SSE42: Found an estimated cost of 23 for instruction: %ctlz
; AVX: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0)
ret <2 x i64> %ctlz
}
@@ -94,6 +104,8 @@ define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
; SSE2: Found an estimated cost of 25 for instruction: %ctlz
; SSE42: Found an estimated cost of 23 for instruction: %ctlz
; AVX: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1)
ret <2 x i64> %ctlz
}
@@ -104,6 +116,8 @@ define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
; SSE42: Found an estimated cost of 46 for instruction: %ctlz
; AVX1: Found an estimated cost of 48 for instruction: %ctlz
; AVX2: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 0)
ret <4 x i64> %ctlz
}
@@ -114,15 +128,45 @@ define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
; SSE42: Found an estimated cost of 46 for instruction: %ctlz
; AVX1: Found an estimated cost of 48 for instruction: %ctlz
; AVX2: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 1)
ret <4 x i64> %ctlz
}
+define <8 x i64> @var_ctlz_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i64':
+; SSE2: Found an estimated cost of 100 for instruction: %ctlz
+; SSE42: Found an estimated cost of 92 for instruction: %ctlz
+; AVX1: Found an estimated cost of 96 for instruction: %ctlz
+; AVX2: Found an estimated cost of 46 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 29 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 0)
+ ret <8 x i64> %ctlz
+}
+
+define <8 x i64> @var_ctlz_v8i64u(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i64u':
+; SSE2: Found an estimated cost of 100 for instruction: %ctlz
+; SSE42: Found an estimated cost of 92 for instruction: %ctlz
+; AVX1: Found an estimated cost of 96 for instruction: %ctlz
+; AVX2: Found an estimated cost of 46 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 29 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 23 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 1)
+ ret <8 x i64> %ctlz
+}
+
define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32':
; SSE2: Found an estimated cost of 26 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0)
ret <4 x i32> %ctlz
}
@@ -132,6 +176,8 @@ define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
; SSE2: Found an estimated cost of 26 for instruction: %ctlz
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1)
ret <4 x i32> %ctlz
}
@@ -142,6 +188,8 @@ define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
; SSE42: Found an estimated cost of 36 for instruction: %ctlz
; AVX1: Found an estimated cost of 38 for instruction: %ctlz
; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 0)
ret <8 x i32> %ctlz
}
@@ -152,15 +200,45 @@ define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
; SSE42: Found an estimated cost of 36 for instruction: %ctlz
; AVX1: Found an estimated cost of 38 for instruction: %ctlz
; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
%ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 1)
ret <8 x i32> %ctlz
}
+define <16 x i32> @var_ctlz_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i32':
+; SSE2: Found an estimated cost of 104 for instruction: %ctlz
+; SSE42: Found an estimated cost of 72 for instruction: %ctlz
+; AVX1: Found an estimated cost of 76 for instruction: %ctlz
+; AVX2: Found an estimated cost of 36 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 35 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 22 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 0)
+ ret <16 x i32> %ctlz
+}
+
+define <16 x i32> @var_ctlz_v16i32u(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i32u':
+; SSE2: Found an estimated cost of 104 for instruction: %ctlz
+; SSE42: Found an estimated cost of 72 for instruction: %ctlz
+; AVX1: Found an estimated cost of 76 for instruction: %ctlz
+; AVX2: Found an estimated cost of 36 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 35 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 22 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 1)
+ ret <16 x i32> %ctlz
+}
+
define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16':
; SSE2: Found an estimated cost of 20 for instruction: %ctlz
; SSE42: Found an estimated cost of 14 for instruction: %ctlz
; AVX: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0)
ret <8 x i16> %ctlz
}
@@ -170,6 +248,8 @@ define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
; SSE2: Found an estimated cost of 20 for instruction: %ctlz
; SSE42: Found an estimated cost of 14 for instruction: %ctlz
; AVX: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1)
ret <8 x i16> %ctlz
}
@@ -180,6 +260,8 @@ define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
; SSE42: Found an estimated cost of 28 for instruction: %ctlz
; AVX1: Found an estimated cost of 30 for instruction: %ctlz
; AVX2: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 0)
ret <16 x i16> %ctlz
}
@@ -190,15 +272,45 @@ define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
; SSE42: Found an estimated cost of 28 for instruction: %ctlz
; AVX1: Found an estimated cost of 30 for instruction: %ctlz
; AVX2: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512: Found an estimated cost of 14 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 1)
ret <16 x i16> %ctlz
}
+define <32 x i16> @var_ctlz_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i16':
+; SSE2: Found an estimated cost of 80 for instruction: %ctlz
+; SSE42: Found an estimated cost of 56 for instruction: %ctlz
+; AVX1: Found an estimated cost of 60 for instruction: %ctlz
+; AVX2: Found an estimated cost of 28 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 28 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 8 for instruction: %ctlz
+ %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 0)
+ ret <32 x i16> %ctlz
+}
+
+define <32 x i16> @var_ctlz_v32i16u(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i16u':
+; SSE2: Found an estimated cost of 80 for instruction: %ctlz
+; SSE42: Found an estimated cost of 56 for instruction: %ctlz
+; AVX1: Found an estimated cost of 60 for instruction: %ctlz
+; AVX2: Found an estimated cost of 28 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 28 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 8 for instruction: %ctlz
+ %ctlz = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %a, i1 1)
+ ret <32 x i16> %ctlz
+}
+
define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8':
; SSE2: Found an estimated cost of 17 for instruction: %ctlz
; SSE42: Found an estimated cost of 9 for instruction: %ctlz
; AVX: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0)
ret <16 x i8> %ctlz
}
@@ -208,6 +320,8 @@ define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
; SSE2: Found an estimated cost of 17 for instruction: %ctlz
; SSE42: Found an estimated cost of 9 for instruction: %ctlz
; AVX: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 4 for instruction: %ctlz
%ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1)
ret <16 x i8> %ctlz
}
@@ -218,6 +332,8 @@ define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX1: Found an estimated cost of 20 for instruction: %ctlz
; AVX2: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 10 for instruction: %ctlz
%ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 0)
ret <32 x i8> %ctlz
}
@@ -228,6 +344,34 @@ define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
; SSE42: Found an estimated cost of 18 for instruction: %ctlz
; AVX1: Found an estimated cost of 20 for instruction: %ctlz
; AVX2: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512: Found an estimated cost of 9 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 10 for instruction: %ctlz
%ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 1)
ret <32 x i8> %ctlz
}
+
+define <64 x i8> @var_ctlz_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v64i8':
+; SSE2: Found an estimated cost of 68 for instruction: %ctlz
+; SSE42: Found an estimated cost of 36 for instruction: %ctlz
+; AVX1: Found an estimated cost of 40 for instruction: %ctlz
+; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 17 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 20 for instruction: %ctlz
+ %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 0)
+ ret <64 x i8> %ctlz
+}
+
+define <64 x i8> @var_ctlz_v64i8u(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v64i8u':
+; SSE2: Found an estimated cost of 68 for instruction: %ctlz
+; SSE42: Found an estimated cost of 36 for instruction: %ctlz
+; AVX1: Found an estimated cost of 40 for instruction: %ctlz
+; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512F: Found an estimated cost of 18 for instruction: %ctlz
+; AVX512BW: Found an estimated cost of 17 for instruction: %ctlz
+; AVX512CD: Found an estimated cost of 20 for instruction: %ctlz
+ %ctlz = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %a, i1 1)
+ ret <64 x i8> %ctlz
+}
diff --git a/test/Analysis/CostModel/X86/ctpop.ll b/test/Analysis/CostModel/X86/ctpop.ll
index f072cbaec492..e6a14e98e37a 100644
--- a/test/Analysis/CostModel/X86/ctpop.ll
+++ b/test/Analysis/CostModel/X86/ctpop.ll
@@ -4,6 +4,8 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512F -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512BW -check-prefix=POPCNT
; Verify the cost of scalar population count instructions.
@@ -56,11 +58,17 @@ declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
+declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
+declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)
+declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)
+
define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v2i64':
; SSE2: Found an estimated cost of 12 for instruction: %ctpop
; SSE42: Found an estimated cost of 7 for instruction: %ctpop
; AVX: Found an estimated cost of 7 for instruction: %ctpop
+; AVX512: Found an estimated cost of 7 for instruction: %ctpop
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
ret <2 x i64> %ctpop
}
@@ -71,15 +79,29 @@ define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
; SSE42: Found an estimated cost of 14 for instruction: %ctpop
; AVX1: Found an estimated cost of 16 for instruction: %ctpop
; AVX2: Found an estimated cost of 7 for instruction: %ctpop
+; AVX512: Found an estimated cost of 7 for instruction: %ctpop
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
ret <4 x i64> %ctpop
}
+define <8 x i64> @var_ctpop_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i64':
+; SSE2: Found an estimated cost of 48 for instruction: %ctpop
+; SSE42: Found an estimated cost of 28 for instruction: %ctpop
+; AVX1: Found an estimated cost of 32 for instruction: %ctpop
+; AVX2: Found an estimated cost of 14 for instruction: %ctpop
+; AVX512F: Found an estimated cost of 16 for instruction: %ctpop
+; AVX512BW: Found an estimated cost of 7 for instruction: %ctpop
+ %ctpop = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
+ ret <8 x i64> %ctpop
+}
+
define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i32':
; SSE2: Found an estimated cost of 15 for instruction: %ctpop
; SSE42: Found an estimated cost of 11 for instruction: %ctpop
; AVX: Found an estimated cost of 11 for instruction: %ctpop
+; AVX512: Found an estimated cost of 11 for instruction: %ctpop
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
ret <4 x i32> %ctpop
}
@@ -90,15 +112,29 @@ define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
; SSE42: Found an estimated cost of 22 for instruction: %ctpop
; AVX1: Found an estimated cost of 24 for instruction: %ctpop
; AVX2: Found an estimated cost of 11 for instruction: %ctpop
+; AVX512: Found an estimated cost of 11 for instruction: %ctpop
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
ret <8 x i32> %ctpop
}
+define <16 x i32> @var_ctpop_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i32':
+; SSE2: Found an estimated cost of 60 for instruction: %ctpop
+; SSE42: Found an estimated cost of 44 for instruction: %ctpop
+; AVX1: Found an estimated cost of 48 for instruction: %ctpop
+; AVX2: Found an estimated cost of 22 for instruction: %ctpop
+; AVX512F: Found an estimated cost of 24 for instruction: %ctpop
+; AVX512BW: Found an estimated cost of 11 for instruction: %ctpop
+ %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a)
+ ret <16 x i32> %ctpop
+}
+
define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i16':
; SSE2: Found an estimated cost of 13 for instruction: %ctpop
; SSE42: Found an estimated cost of 9 for instruction: %ctpop
; AVX: Found an estimated cost of 9 for instruction: %ctpop
+; AVX512: Found an estimated cost of 9 for instruction: %ctpop
%ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
ret <8 x i16> %ctpop
}
@@ -109,15 +145,29 @@ define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) {
; SSE42: Found an estimated cost of 18 for instruction: %ctpop
; AVX1: Found an estimated cost of 20 for instruction: %ctpop
; AVX2: Found an estimated cost of 9 for instruction: %ctpop
+; AVX512: Found an estimated cost of 9 for instruction: %ctpop
%ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
ret <16 x i16> %ctpop
}
+define <32 x i16> @var_ctpop_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v32i16':
+; SSE2: Found an estimated cost of 52 for instruction: %ctpop
+; SSE42: Found an estimated cost of 36 for instruction: %ctpop
+; AVX1: Found an estimated cost of 40 for instruction: %ctpop
+; AVX2: Found an estimated cost of 18 for instruction: %ctpop
+; AVX512F: Found an estimated cost of 18 for instruction: %ctpop
+; AVX512BW: Found an estimated cost of 9 for instruction: %ctpop
+ %ctpop = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %a)
+ ret <32 x i16> %ctpop
+}
+
define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i8':
; SSE2: Found an estimated cost of 10 for instruction: %ctpop
; SSE42: Found an estimated cost of 6 for instruction: %ctpop
; AVX: Found an estimated cost of 6 for instruction: %ctpop
+; AVX512: Found an estimated cost of 6 for instruction: %ctpop
%ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
ret <16 x i8> %ctpop
}
@@ -128,6 +178,19 @@ define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) {
; SSE42: Found an estimated cost of 12 for instruction: %ctpop
; AVX1: Found an estimated cost of 14 for instruction: %ctpop
; AVX2: Found an estimated cost of 6 for instruction: %ctpop
+; AVX512: Found an estimated cost of 6 for instruction: %ctpop
%ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
ret <32 x i8> %ctpop
}
+
+define <64 x i8> @var_ctpop_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v64i8':
+; SSE2: Found an estimated cost of 40 for instruction: %ctpop
+; SSE42: Found an estimated cost of 24 for instruction: %ctpop
+; AVX1: Found an estimated cost of 28 for instruction: %ctpop
+; AVX2: Found an estimated cost of 12 for instruction: %ctpop
+; AVX512F: Found an estimated cost of 12 for instruction: %ctpop
+; AVX512BW: Found an estimated cost of 6 for instruction: %ctpop
+ %ctpop = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %a)
+ ret <64 x i8> %ctpop
+}
diff --git a/test/Analysis/CostModel/X86/cttz.ll b/test/Analysis/CostModel/X86/cttz.ll
index 5d3c59b60232..e7a39781385e 100644
--- a/test/Analysis/CostModel/X86/cttz.ll
+++ b/test/Analysis/CostModel/X86/cttz.ll
@@ -1,9 +1,11 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX512 -check-prefix=AVX512BW
; Verify the cost of scalar trailing zero count instructions.
@@ -80,11 +82,17 @@ declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)
+declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>, i1)
+declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
+declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>, i1)
+declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>, i1)
+
define <2 x i64> @var_cttz_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64':
; SSE2: Found an estimated cost of 14 for instruction: %cttz
; SSE42: Found an estimated cost of 10 for instruction: %cttz
; AVX: Found an estimated cost of 10 for instruction: %cttz
+; AVX512: Found an estimated cost of 10 for instruction: %cttz
%cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 0)
ret <2 x i64> %cttz
}
@@ -94,6 +102,7 @@ define <2 x i64> @var_cttz_v2i64u(<2 x i64> %a) {
; SSE2: Found an estimated cost of 14 for instruction: %cttz
; SSE42: Found an estimated cost of 10 for instruction: %cttz
; AVX: Found an estimated cost of 10 for instruction: %cttz
+; AVX512: Found an estimated cost of 10 for instruction: %cttz
%cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 1)
ret <2 x i64> %cttz
}
@@ -104,6 +113,7 @@ define <4 x i64> @var_cttz_v4i64(<4 x i64> %a) {
; SSE42: Found an estimated cost of 20 for instruction: %cttz
; AVX1: Found an estimated cost of 22 for instruction: %cttz
; AVX2: Found an estimated cost of 10 for instruction: %cttz
+; AVX512: Found an estimated cost of 10 for instruction: %cttz
%cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 0)
ret <4 x i64> %cttz
}
@@ -114,15 +124,41 @@ define <4 x i64> @var_cttz_v4i64u(<4 x i64> %a) {
; SSE42: Found an estimated cost of 20 for instruction: %cttz
; AVX1: Found an estimated cost of 22 for instruction: %cttz
; AVX2: Found an estimated cost of 10 for instruction: %cttz
+; AVX512: Found an estimated cost of 10 for instruction: %cttz
%cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 1)
ret <4 x i64> %cttz
}
+define <8 x i64> @var_cttz_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i64':
+; SSE2: Found an estimated cost of 56 for instruction: %cttz
+; SSE42: Found an estimated cost of 40 for instruction: %cttz
+; AVX1: Found an estimated cost of 44 for instruction: %cttz
+; AVX2: Found an estimated cost of 20 for instruction: %cttz
+; AVX512F: Found an estimated cost of 20 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 0)
+ ret <8 x i64> %cttz
+}
+
+define <8 x i64> @var_cttz_v8i64u(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i64u':
+; SSE2: Found an estimated cost of 56 for instruction: %cttz
+; SSE42: Found an estimated cost of 40 for instruction: %cttz
+; AVX1: Found an estimated cost of 44 for instruction: %cttz
+; AVX2: Found an estimated cost of 20 for instruction: %cttz
+; AVX512F: Found an estimated cost of 20 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %a, i1 1)
+ ret <8 x i64> %cttz
+}
+
define <4 x i32> @var_cttz_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32':
; SSE2: Found an estimated cost of 18 for instruction: %cttz
; SSE42: Found an estimated cost of 14 for instruction: %cttz
; AVX: Found an estimated cost of 14 for instruction: %cttz
+; AVX512: Found an estimated cost of 14 for instruction: %cttz
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 0)
ret <4 x i32> %cttz
}
@@ -132,6 +168,7 @@ define <4 x i32> @var_cttz_v4i32u(<4 x i32> %a) {
; SSE2: Found an estimated cost of 18 for instruction: %cttz
; SSE42: Found an estimated cost of 14 for instruction: %cttz
; AVX: Found an estimated cost of 14 for instruction: %cttz
+; AVX512: Found an estimated cost of 14 for instruction: %cttz
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 1)
ret <4 x i32> %cttz
}
@@ -142,6 +179,7 @@ define <8 x i32> @var_cttz_v8i32(<8 x i32> %a) {
; SSE42: Found an estimated cost of 28 for instruction: %cttz
; AVX1: Found an estimated cost of 30 for instruction: %cttz
; AVX2: Found an estimated cost of 14 for instruction: %cttz
+; AVX512: Found an estimated cost of 14 for instruction: %cttz
%cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 0)
ret <8 x i32> %cttz
}
@@ -152,15 +190,41 @@ define <8 x i32> @var_cttz_v8i32u(<8 x i32> %a) {
; SSE42: Found an estimated cost of 28 for instruction: %cttz
; AVX1: Found an estimated cost of 30 for instruction: %cttz
; AVX2: Found an estimated cost of 14 for instruction: %cttz
+; AVX512: Found an estimated cost of 14 for instruction: %cttz
%cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 1)
ret <8 x i32> %cttz
}
+define <16 x i32> @var_cttz_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i32':
+; SSE2: Found an estimated cost of 72 for instruction: %cttz
+; SSE42: Found an estimated cost of 56 for instruction: %cttz
+; AVX1: Found an estimated cost of 60 for instruction: %cttz
+; AVX2: Found an estimated cost of 28 for instruction: %cttz
+; AVX512F: Found an estimated cost of 28 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 0)
+ ret <16 x i32> %cttz
+}
+
+define <16 x i32> @var_cttz_v16i32u(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i32u':
+; SSE2: Found an estimated cost of 72 for instruction: %cttz
+; SSE42: Found an estimated cost of 56 for instruction: %cttz
+; AVX1: Found an estimated cost of 60 for instruction: %cttz
+; AVX2: Found an estimated cost of 28 for instruction: %cttz
+; AVX512F: Found an estimated cost of 28 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %a, i1 1)
+ ret <16 x i32> %cttz
+}
+
define <8 x i16> @var_cttz_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16':
; SSE2: Found an estimated cost of 16 for instruction: %cttz
; SSE42: Found an estimated cost of 12 for instruction: %cttz
; AVX: Found an estimated cost of 12 for instruction: %cttz
+; AVX512: Found an estimated cost of 12 for instruction: %cttz
%cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 0)
ret <8 x i16> %cttz
}
@@ -170,6 +234,7 @@ define <8 x i16> @var_cttz_v8i16u(<8 x i16> %a) {
; SSE2: Found an estimated cost of 16 for instruction: %cttz
; SSE42: Found an estimated cost of 12 for instruction: %cttz
; AVX: Found an estimated cost of 12 for instruction: %cttz
+; AVX512: Found an estimated cost of 12 for instruction: %cttz
%cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 1)
ret <8 x i16> %cttz
}
@@ -180,6 +245,7 @@ define <16 x i16> @var_cttz_v16i16(<16 x i16> %a) {
; SSE42: Found an estimated cost of 24 for instruction: %cttz
; AVX1: Found an estimated cost of 26 for instruction: %cttz
; AVX2: Found an estimated cost of 12 for instruction: %cttz
+; AVX512: Found an estimated cost of 12 for instruction: %cttz
%cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 0)
ret <16 x i16> %cttz
}
@@ -190,15 +256,41 @@ define <16 x i16> @var_cttz_v16i16u(<16 x i16> %a) {
; SSE42: Found an estimated cost of 24 for instruction: %cttz
; AVX1: Found an estimated cost of 26 for instruction: %cttz
; AVX2: Found an estimated cost of 12 for instruction: %cttz
+; AVX512: Found an estimated cost of 12 for instruction: %cttz
%cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 1)
ret <16 x i16> %cttz
}
+define <32 x i16> @var_cttz_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i16':
+; SSE2: Found an estimated cost of 64 for instruction: %cttz
+; SSE42: Found an estimated cost of 48 for instruction: %cttz
+; AVX1: Found an estimated cost of 52 for instruction: %cttz
+; AVX2: Found an estimated cost of 24 for instruction: %cttz
+; AVX512F: Found an estimated cost of 24 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 0)
+ ret <32 x i16> %cttz
+}
+
+define <32 x i16> @var_cttz_v32i16u(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i16u':
+; SSE2: Found an estimated cost of 64 for instruction: %cttz
+; SSE42: Found an estimated cost of 48 for instruction: %cttz
+; AVX1: Found an estimated cost of 52 for instruction: %cttz
+; AVX2: Found an estimated cost of 24 for instruction: %cttz
+; AVX512F: Found an estimated cost of 24 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %a, i1 1)
+ ret <32 x i16> %cttz
+}
+
define <16 x i8> @var_cttz_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8':
; SSE2: Found an estimated cost of 13 for instruction: %cttz
; SSE42: Found an estimated cost of 9 for instruction: %cttz
; AVX: Found an estimated cost of 9 for instruction: %cttz
+; AVX512: Found an estimated cost of 9 for instruction: %cttz
%cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 0)
ret <16 x i8> %cttz
}
@@ -208,6 +300,7 @@ define <16 x i8> @var_cttz_v16i8u(<16 x i8> %a) {
; SSE2: Found an estimated cost of 13 for instruction: %cttz
; SSE42: Found an estimated cost of 9 for instruction: %cttz
; AVX: Found an estimated cost of 9 for instruction: %cttz
+; AVX512: Found an estimated cost of 9 for instruction: %cttz
%cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 1)
ret <16 x i8> %cttz
}
@@ -218,6 +311,7 @@ define <32 x i8> @var_cttz_v32i8(<32 x i8> %a) {
; SSE42: Found an estimated cost of 18 for instruction: %cttz
; AVX1: Found an estimated cost of 20 for instruction: %cttz
; AVX2: Found an estimated cost of 9 for instruction: %cttz
+; AVX512: Found an estimated cost of 9 for instruction: %cttz
%cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 0)
ret <32 x i8> %cttz
}
@@ -228,6 +322,31 @@ define <32 x i8> @var_cttz_v32i8u(<32 x i8> %a) {
; SSE42: Found an estimated cost of 18 for instruction: %cttz
; AVX1: Found an estimated cost of 20 for instruction: %cttz
; AVX2: Found an estimated cost of 9 for instruction: %cttz
+; AVX512: Found an estimated cost of 9 for instruction: %cttz
%cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 1)
ret <32 x i8> %cttz
}
+
+define <64 x i8> @var_cttz_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v64i8':
+; SSE2: Found an estimated cost of 52 for instruction: %cttz
+; SSE42: Found an estimated cost of 36 for instruction: %cttz
+; AVX1: Found an estimated cost of 40 for instruction: %cttz
+; AVX2: Found an estimated cost of 18 for instruction: %cttz
+; AVX512F: Found an estimated cost of 18 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 0)
+ ret <64 x i8> %cttz
+}
+
+define <64 x i8> @var_cttz_v64i8u(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v64i8u':
+; SSE2: Found an estimated cost of 52 for instruction: %cttz
+; SSE42: Found an estimated cost of 36 for instruction: %cttz
+; AVX1: Found an estimated cost of 40 for instruction: %cttz
+; AVX2: Found an estimated cost of 18 for instruction: %cttz
+; AVX512F: Found an estimated cost of 18 for instruction: %cttz
+; AVX512BW: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %a, i1 1)
+ ret <64 x i8> %cttz
+}
diff --git a/test/Analysis/ScalarEvolution/nsw.ll b/test/Analysis/ScalarEvolution/nsw.ll
index a3752919d334..39b958d3ea0e 100644
--- a/test/Analysis/ScalarEvolution/nsw.ll
+++ b/test/Analysis/ScalarEvolution/nsw.ll
@@ -102,7 +102,7 @@ for.body.i.i: ; preds = %entry, %for.body.i.
%cmp.i.i = icmp eq i32* %ptrincdec.i.i, %end
br i1 %cmp.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i
; CHECK: Loop %for.body.i.i: backedge-taken count is ((-4 + (-1 * %begin) + %end) /u 4)
-; CHECK: Loop %for.body.i.i: max backedge-taken count is ((-4 + (-1 * %begin) + %end) /u 4)
+; CHECK: Loop %for.body.i.i: max backedge-taken count is 4611686018427387903
_ZSt4fillIPiiEvT_S1_RKT0_.exit: ; preds = %for.body.i.i, %entry
ret void
}
diff --git a/test/Analysis/ScalarEvolution/trip-count-pow2.ll b/test/Analysis/ScalarEvolution/trip-count-pow2.ll
index 8d053060b50c..04d1b9544ab2 100644
--- a/test/Analysis/ScalarEvolution/trip-count-pow2.ll
+++ b/test/Analysis/ScalarEvolution/trip-count-pow2.ll
@@ -14,7 +14,7 @@ exit:
; CHECK-LABEL: @test1
; CHECK: Loop %loop: backedge-taken count is ((-32 + (96 * %n)) /u 32)
-; CHECK: Loop %loop: max backedge-taken count is ((-32 + (96 * %n)) /u 32)
+; CHECK: Loop %loop: max backedge-taken count is 134217727
}
; PR19183
@@ -32,7 +32,7 @@ exit:
; CHECK-LABEL: @test2
; CHECK: Loop %loop: backedge-taken count is ((-32 + (32 * (%n /u 32))) /u 32)
-; CHECK: Loop %loop: max backedge-taken count is ((-32 + (32 * (%n /u 32))) /u 32)
+; CHECK: Loop %loop: max backedge-taken count is 134217727
}
define void @test3(i32 %n) {
@@ -49,7 +49,7 @@ exit:
; CHECK-LABEL: @test3
; CHECK: Loop %loop: backedge-taken count is ((-32 + (32 * %n)) /u 32)
-; CHECK: Loop %loop: max backedge-taken count is ((-32 + (32 * %n)) /u 32)
+; CHECK: Loop %loop: max backedge-taken count is 134217727
}
define void @test4(i32 %n) {
@@ -66,7 +66,7 @@ exit:
; CHECK-LABEL: @test4
; CHECK: Loop %loop: backedge-taken count is ((-4 + (-1431655764 * %n)) /u 4)
-; CHECK: Loop %loop: max backedge-taken count is ((-4 + (-1431655764 * %n)) /u 4)
+; CHECK: Loop %loop: max backedge-taken count is 1073741823
}
define void @test5(i32 %n) {
@@ -83,5 +83,5 @@ exit:
; CHECK-LABEL: @test5
; CHECK: Loop %loop: backedge-taken count is ((-4 + (4 * %n)) /u 4)
-; CHECK: Loop %loop: max backedge-taken count is ((-4 + (4 * %n)) /u 4)
+; CHECK: Loop %loop: max backedge-taken count is 1073741823
}
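
The new constant max backedge-taken counts match the simple wrap-around bound for a W-bit induction variable stepping by a power-of-two stride 2^s: at most 2^(W-s) - 1 backedges. Whether ScalarEvolution derives them exactly this way is an assumption here, but the numbers line up, including the 2^62 - 1 bound in the nsw.ll change above:

#include <cstdio>

int main() {
  std::printf("%u\n", (1u << 27) - 1);      // 134217727  (i32 counter, stride 32)
  std::printf("%u\n", (1u << 30) - 1);      // 1073741823 (i32 counter, stride 4)
  std::printf("%llu\n", (1ull << 62) - 1);  // 4611686018427387903 (i64 pointer, stride 4)
}
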
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index fc1aeb7b37d9..2682fa7dcce1 100644
--- a/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -378,11 +378,11 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
; CHECK-NEXT: cmp x0, #13
; CHECK-NOT: ccmp
; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt
-; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]]
; CHECK-NEXT: cmp x2, #2
; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt
; CHECK-NEXT: cmp x2, #4
; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt
+; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]]
; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]]
; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]]
; CHECK-NEXT: cmp [[REG6]], #0
diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
index 4c0195b93a44..3593668e0156 100644
--- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll
+++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
@@ -12,7 +12,7 @@
; CHECK: Successors:
; CHECK-NOT: ch SU(4)
; CHECK: SU(3)
-; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}}
+; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}}
define i32 @foo() {
entry:
%0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4
diff --git a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 5b7800996133..cdfb667c26bd 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -6,7 +6,8 @@
; Tests for add.
; CHECK: name: addi32
; CHECK: {{%[0-9]+}}(s32) = G_ADD
-define i32 @addi32(i32 %arg1, i32 %arg2) {
+define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
%res = add i32 %arg1, %arg2
- ret i32 %res
+ store i32 %res, i32 addrspace(1)* undef
+ ret void
}
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index 3b274c9d2027..bee13d8c17f1 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -84,11 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index 73e80d523f1e..a6b280578531 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -202,10 +202,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: v_add_u16_e32
-; VI: v_add_u16_e32
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_add_u16_e32
+; VI-DAG: v_add_u16_e32
; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
diff --git a/test/CodeGen/AMDGPU/bfe-patterns.ll b/test/CodeGen/AMDGPU/bfe-patterns.ll
index c23cc1c88b52..907c8c2216b7 100644
--- a/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/coalescer_distribute.ll b/test/CodeGen/AMDGPU/coalescer_distribute.ll
index 7ca2612598c8..d0276a3fb59c 100644
--- a/test/CodeGen/AMDGPU/coalescer_distribute.ll
+++ b/test/CodeGen/AMDGPU/coalescer_distribute.ll
@@ -5,7 +5,7 @@ target triple = "amdgcn--"
define spir_kernel void @hoge() {
bb:
- %tmp = tail call i32 @llvm.r600.read.tidig.x()
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
br i1 undef, label %bb2, label %bb23
bb2:
@@ -50,4 +50,4 @@ bb34:
ret void
}
-declare i32 @llvm.r600.read.tidig.x()
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index e252971e3f42..149c50685b1d 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -135,7 +135,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}
; FUNC-LABEL: {{^}}v_ctlz_i64:
-; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
@@ -145,7 +144,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
-; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 87ba563a740f..48f3e4401f1a 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -121,8 +121,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
-; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
+; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index ab1cf0ba25b5..0f49919a1d10 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -266,8 +266,8 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
}
; SI-LABEL: @simple_write2_one_val_f64
-; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
-; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index bd861e0c663e..3ae74abcb6cb 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s
; This tests that the llvm.SI.end.cf intrinsic is not inserted into the
; loop block. This intrinsic will be lowered to s_or_b64 by the code
@@ -14,7 +14,7 @@
; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
entry:
- %cond = call i32 @llvm.r600.read.tidig.x() #0
+ %cond = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %loop
@@ -34,6 +34,6 @@ done:
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
-attributes #0 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index d2cfc713ed37..27d9261b1fab 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -845,10 +845,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
-; GCN: v_min_f32
-; GCN: v_max_f32
-; GCN: v_min_f32
-; GCN: v_max_f32
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll
new file mode 100644
index 000000000000..d67988b46325
--- /dev/null
+++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -0,0 +1,124 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Test that non-entry function frame indices are expanded properly to
+; give an index relative to the scratch wave offset register.
+
+; Materialize into a mov. Make sure there isn't an unnecessary copy.
+; GCN-LABEL: {{^}}func_mov_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 vcc_hi, s5, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_mov_fi_i32() #0 {
+ %alloca = alloca i32
+ store volatile i32* %alloca, i32* addrspace(3)* undef
+ ret void
+}
+
+; Materialize into an add of a constant offset from the FI.
+; FIXME: Should be able to merge adds
+
+; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-NEXT: s_lshr_b32 s6, s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_add_constant_to_fi_i32() #0 {
+ %alloca = alloca [2 x i32], align 4
+ %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
+ store volatile i32* %gep0, i32* addrspace(3)* undef
+ ret void
+}
+
+; A user that the materialized frame index can't be meaningfully folded
+; into.
+
+; GCN-LABEL: {{^}}func_other_fi_user_i32:
+; GCN: s_sub_u32 vcc_hi, s5, s4
+; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
+; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_i32() #0 {
+ %alloca = alloca [2 x i32], align 4
+ %ptrtoint = ptrtoint [2 x i32]* %alloca to i32
+ %mul = mul i32 %ptrtoint, 9
+ store volatile i32 %mul, i32 addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
+; GCN: v_mov_b32_e32 v1, 15{{$}}
+; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
+define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
+ store volatile i32 15, i32* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
+; GCN: s_waitcnt
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
+define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
+ %val = load volatile i32, i32* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
+; GCN: s_waitcnt
+; GCN-NEXT: s_sub_u32 s6, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NOT: v_mov
+; GCN: ds_write_b32 v0, v0
+define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
+ %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
+ %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
+ %load1 = load i32, i32* %gep1
+ store volatile i32* %gep1, i32* addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
+define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
+ %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
+ %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
+ %load0 = load i8, i8* %gep0
+ %load1 = load i32, i32* %gep1
+ store volatile i8 %load0, i8 addrspace(3)* undef
+ store volatile i32 %load1, i32 addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
+; GCN: s_sub_u32 s8, s5, s4
+; GCN: v_lshr_b32_e64 v1, s8, 6
+; GCN: s_and_saveexec_b64
+
+; GCN: v_add_i32_e32 v0, vcc, 4, v1
+; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4
+; GCN: ds_write_b32
+define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 {
+ %cmp = icmp eq i32 %arg2, 0
+ br i1 %cmp, label %bb, label %ret
+
+bb:
+ %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
+ %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
+ %load1 = load volatile i32, i32* %gep1
+ store volatile i32* %gep1, i32* addrspace(3)* undef
+ br label %ret
+
+ret:
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll
new file mode 100644
index 000000000000..9b1368493ba5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/function-args.ll
@@ -0,0 +1,734 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}void_func_i1:
+; GCN: v_and_b32_e32 v0, 1, v0
+; GCN: buffer_store_byte v0, off
+define void @void_func_i1(i1 %arg0) #0 {
+ store i1 %arg0, i1 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i1_zeroext:
+; GCN: s_waitcnt
+; GCN-NEXT: v_or_b32_e32 v0, 12, v0
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
+ %ext = zext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i1_signext:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+define void @void_func_i1_signext(i1 signext %arg0) #0 {
+ %ext = sext i1 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i8:
+; GCN-NOT: v0
+; GCN: buffer_store_byte v0, off
+define void @void_func_i8(i8 %arg0) #0 {
+ store i8 %arg0, i8 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i8_zeroext:
+; GCN-NOT: and_b32
+; GCN: v_add_i32_e32 v0, vcc, 12, v0
+define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
+ %ext = zext i8 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i8_signext:
+; GCN-NOT: v_bfe_i32
+; GCN: v_add_i32_e32 v0, vcc, 12, v0
+define void @void_func_i8_signext(i8 signext %arg0) #0 {
+ %ext = sext i8 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i16:
+; GCN: buffer_store_short v0, off
+define void @void_func_i16(i16 %arg0) #0 {
+ store i16 %arg0, i16 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i16_zeroext:
+; GCN-NOT: v0
+; GCN: v_add_i32_e32 v0, vcc, 12, v0
+define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
+ %ext = zext i16 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i16_signext:
+; GCN-NOT: v0
+; GCN: v_add_i32_e32 v0, vcc, 12, v0
+define void @void_func_i16_signext(i16 signext %arg0) #0 {
+ %ext = sext i16 %arg0 to i32
+ %add = add i32 %ext, 12
+ store i32 %add, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i32:
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+define void @void_func_i32(i32 %arg0) #0 {
+ store i32 %arg0, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_i64:
+; GCN-NOT: v[0:1]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: buffer_store_dwordx2 v[0:1], off
+define void @void_func_i64(i64 %arg0) #0 {
+ store i64 %arg0, i64 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_f16:
+; VI-NOT: v0
+; CI: v_cvt_f16_f32_e32 v0, v0
+; GCN: buffer_store_short v0, off
+define void @void_func_f16(half %arg0) #0 {
+ store half %arg0, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_f32:
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+define void @void_func_f32(float %arg0) #0 {
+ store float %arg0, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_f64:
+; GCN-NOT: v[0:1]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: buffer_store_dwordx2 v[0:1], off
+define void @void_func_f64(double %arg0) #0 {
+ store double %arg0, double addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2i32:
+; GCN-NOT: v[0:1]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v2i32(<2 x i32> %arg0) #0 {
+ store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3i32:
+; GCN-DAG: buffer_store_dword v2, off
+; GCN-DAG: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v3i32(<3 x i32> %arg0) #0 {
+ store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4i32:
+; GCN: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v4i32(<4 x i32> %arg0) #0 {
+ store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v5i32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dword v4, off
+define void @void_func_v5i32(<5 x i32> %arg0) #0 {
+ store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8i32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v8i32(<8 x i32> %arg0) #0 {
+ store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16i32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+define void @void_func_v16i32(<16 x i32> %arg0) #0 {
+ store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+; GCN-DAG: buffer_store_dwordx4 v[16:19], off
+; GCN-DAG: buffer_store_dwordx4 v[20:23], off
+; GCN-DAG: buffer_store_dwordx4 v[24:27], off
+; GCN-DAG: buffer_store_dwordx4 v[28:31], off
+define void @void_func_v32i32(<32 x i32> %arg0) #0 {
+ store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ ret void
+}
+
+; 1 over the VGPR argument register limit; the last element is passed on the stack.
+; GCN-LABEL: {{^}}void_func_v33i32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5
+; GCN-DAG: buffer_store_dwordx4 v[16:19], off
+; GCN-DAG: buffer_store_dwordx4 v[20:23], off
+; GCN-DAG: buffer_store_dwordx4 v[24:27], off
+; GCN-DAG: buffer_store_dwordx4 v[28:31], off
+; GCN: buffer_store_dword [[STACKLOAD]], off
+define void @void_func_v33i32(<33 x i32> %arg0) #0 {
+ store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2i64:
+; GCN: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v2i64(<2 x i64> %arg0) #0 {
+ store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx2 v[4:5], off
+define void @void_func_v3i64(<3 x i64> %arg0) #0 {
+ store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v4i64(<4 x i64> %arg0) #0 {
+ store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v5i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx2 v[8:9], off
+define void @void_func_v5i64(<5 x i64> %arg0) #0 {
+ store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+define void @void_func_v8i64(<8 x i64> %arg0) #0 {
+ store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+; GCN-DAG: buffer_store_dwordx4 v[16:19], off
+; GCN-DAG: buffer_store_dwordx4 v[20:23], off
+; GCN-DAG: buffer_store_dwordx4 v[24:27], off
+; GCN-DAG: buffer_store_dwordx4 v[28:31], off
+define void @void_func_v16i64(<16 x i64> %arg0) #0 {
+ store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2i16:
+; GFX9-NOT: v0
+; GFX9: buffer_store_dword v0, off
+define void @void_func_v2i16(<2 x i16> %arg0) #0 {
+ store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3i16:
+; GCN-DAG: buffer_store_dword v0, off
+; GCN-DAG: buffer_store_short v2, off
+define void @void_func_v3i16(<3 x i16> %arg0) #0 {
+ store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4i16:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v4i16(<4 x i16> %arg0) #0 {
+ store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v5i16:
+; GCN-DAG: buffer_store_short v4, off,
+; GCN-DAG: buffer_store_dwordx2 v[1:2], off
+define void @void_func_v5i16(<5 x i16> %arg0) #0 {
+ store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8i16:
+; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v8i16(<8 x i16> %arg0) #0 {
+ store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16i16:
+; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
+; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v16i16(<16 x i16> %arg0) #0 {
+ store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2f32:
+; GCN-NOT: v[0:1]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v2f32(<2 x float> %arg0) #0 {
+ store <2 x float> %arg0, <2 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3f32:
+; GCN-DAG: buffer_store_dword v2, off
+; GCN-DAG: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v3f32(<3 x float> %arg0) #0 {
+ store <3 x float> %arg0, <3 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4f32:
+; GCN: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v4f32(<4 x float> %arg0) #0 {
+ store <4 x float> %arg0, <4 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8f32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v8f32(<8 x float> %arg0) #0 {
+ store <8 x float> %arg0, <8 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16f32:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+define void @void_func_v16f32(<16 x float> %arg0) #0 {
+ store <16 x float> %arg0, <16 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2f64:
+; GCN: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v2f64(<2 x double> %arg0) #0 {
+ store <2 x double> %arg0, <2 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3f64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx2 v[4:5], off
+define void @void_func_v3f64(<3 x double> %arg0) #0 {
+ store <3 x double> %arg0, <3 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4f64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v4f64(<4 x double> %arg0) #0 {
+ store <4 x double> %arg0, <4 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8f64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+define void @void_func_v8f64(<8 x double> %arg0) #0 {
+ store <8 x double> %arg0, <8 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16f64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+; GCN-DAG: buffer_store_dwordx4 v[16:19], off
+; GCN-DAG: buffer_store_dwordx4 v[20:23], off
+; GCN-DAG: buffer_store_dwordx4 v[24:27], off
+; GCN-DAG: buffer_store_dwordx4 v[28:31], off
+define void @void_func_v16f64(<16 x double> %arg0) #0 {
+ store <16 x double> %arg0, <16 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v2f16:
+; GFX9-NOT: v0
+; GFX9: buffer_store_dword v0, off
+define void @void_func_v2f16(<2 x half> %arg0) #0 {
+ store <2 x half> %arg0, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3f16:
+; GFX9-NOT: v0
+; GCN-DAG: buffer_store_dword v0, off
+; GCN-DAG: buffer_store_short v2, off
+define void @void_func_v3f16(<3 x half> %arg0) #0 {
+ store <3 x half> %arg0, <3 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v4f16:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9-NOT: v[0:1]
+; GFX9: buffer_store_dwordx2 v[0:1], off
+define void @void_func_v4f16(<4 x half> %arg0) #0 {
+ store <4 x half> %arg0, <4 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v8f16:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: buffer_store_dwordx4 v[0:3], off
+define void @void_func_v8f16(<8 x half> %arg0) #0 {
+ store <8 x half> %arg0, <8 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v16f16:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
+; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
+define void @void_func_v16f16(<16 x half> %arg0) #0 {
+ store <16 x half> %arg0, <16 x half> addrspace(1)* undef
+ ret void
+}
+
+; Make sure there is no alignment requirement for passed VGPRs.
+; GCN-LABEL: {{^}}void_func_i32_i64_i32:
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+; GCN: buffer_store_dwordx2 v[1:2]
+; GCN: buffer_store_dword v3
+define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ store volatile i64 %arg1, i64 addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_struct_i32:
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0, off
+define void @void_func_struct_i32({ i32 } %arg0) #0 {
+ store { i32 } %arg0, { i32 } addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_struct_i8_i32:
+; GCN-DAG: buffer_store_byte v0, off
+; GCN-DAG: buffer_store_dword v1, off
+define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
+ store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
+; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v[[ELT1]]
+; GCN-DAG: buffer_store_byte v[[ELT0]]
+define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 {
+ %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0
+ store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
+; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; GCN: ds_write_b32 v0, v0
+; GCN: s_setpc_b64
+define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 {
+ %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0
+ %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1
+ store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
+ store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef
+ store volatile i32 %arg2, i32 addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
+; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off
+define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 {
+ %arg0.load = load i32, i32* %arg0
+ %arg1.load = load i64, i64* %arg1
+ store i32 %arg0.load, i32 addrspace(1)* undef
+ store i64 %arg1.load, i64 addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_i32_i64:
+; GCN-DAG: buffer_store_dwordx4 v[0:3], off
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off
+; GCN-DAG: buffer_store_dwordx4 v[8:11], off
+; GCN-DAG: buffer_store_dwordx4 v[12:15], off
+; GCN-DAG: buffer_store_dwordx4 v[16:19], off
+; GCN-DAG: buffer_store_dwordx4 v[20:23], off
+; GCN-DAG: buffer_store_dwordx4 v[24:27], off
+; GCN-DAG: buffer_store_dwordx4 v[28:31], off
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8
+
+; GCN: buffer_store_dword v[[LOAD_ARG1]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
+define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile i32 %arg1, i32 addrspace(1)* undef
+ store volatile i64 %arg2, i64 addrspace(1)* undef
+ ret void
+}
+
+; FIXME: Different ext load types on CI vs. VI
+; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
+; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
+; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
+
+; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off
+; GCN: buffer_store_byte [[LOAD_ARG2]], off
+; GCN: buffer_store_short [[LOAD_ARG3]], off
+; VI: buffer_store_short [[LOAD_ARG4]], off
+
+; CI: buffer_store_short [[CVT_ARG4]], off
+define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile i1 %arg1, i1 addrspace(1)* undef
+ store volatile i8 %arg2, i8 addrspace(1)* undef
+ store volatile i16 %arg3, i16 addrspace(1)* undef
+ store volatile half %arg4, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
+; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
+define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
+ store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16:
+; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
+; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GFX9: buffer_store_dword [[LOAD_ARG1]], off
+; GFX9: buffer_store_short [[LOAD_ARG2]], off
+define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
+ store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
+define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
+ store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
+define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
+ store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
+
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off
+; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
+define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
+ store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}}
+define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
+ store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
+ ret void
+}
+
+; Check there is no crash.
+; GCN-LABEL: {{^}}void_func_v16i8:
+define void @void_func_v16i8(<16 x i8> %arg0) #0 {
+ store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
+ ret void
+}
+
+; Check there is no crash.
+; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
+define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
+ store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+ store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/function-returns.ll b/test/CodeGen/AMDGPU/function-returns.ll
new file mode 100644
index 000000000000..f704d43a1742
--- /dev/null
+++ b/test/CodeGen/AMDGPU/function-returns.ll
@@ -0,0 +1,514 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}i1_func_void:
+; GCN: buffer_load_ubyte v0, off
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define i1 @i1_func_void() #0 {
+ %val = load i1, i1 addrspace(1)* undef
+ ret i1 %val
+}
+
+; FIXME: Missing and?
+; GCN-LABEL: {{^}}i1_zeroext_func_void:
+; GCN: buffer_load_ubyte v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define zeroext i1 @i1_zeroext_func_void() #0 {
+ %val = load i1, i1 addrspace(1)* undef
+ ret i1 %val
+}
+
+; GCN-LABEL: {{^}}i1_signext_func_void:
+; GCN: buffer_load_ubyte v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
+; GCN-NEXT: s_setpc_b64
+define signext i1 @i1_signext_func_void() #0 {
+ %val = load i1, i1 addrspace(1)* undef
+ ret i1 %val
+}
+
+; GCN-LABEL: {{^}}i8_func_void:
+; GCN: buffer_load_ubyte v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i8 @i8_func_void() #0 {
+ %val = load i8, i8 addrspace(1)* undef
+ ret i8 %val
+}
+
+; GCN-LABEL: {{^}}i8_zeroext_func_void:
+; GCN: buffer_load_ubyte v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define zeroext i8 @i8_zeroext_func_void() #0 {
+ %val = load i8, i8 addrspace(1)* undef
+ ret i8 %val
+}
+
+; GCN-LABEL: {{^}}i8_signext_func_void:
+; GCN: buffer_load_sbyte v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define signext i8 @i8_signext_func_void() #0 {
+ %val = load i8, i8 addrspace(1)* undef
+ ret i8 %val
+}
+
+; GCN-LABEL: {{^}}i16_func_void:
+; GCN: buffer_load_ushort v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i16 @i16_func_void() #0 {
+ %val = load i16, i16 addrspace(1)* undef
+ ret i16 %val
+}
+
+; GCN-LABEL: {{^}}i16_zeroext_func_void:
+; GCN: buffer_load_ushort v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define zeroext i16 @i16_zeroext_func_void() #0 {
+ %val = load i16, i16 addrspace(1)* undef
+ ret i16 %val
+}
+
+; GCN-LABEL: {{^}}i16_signext_func_void:
+; GCN: buffer_load_sshort v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define signext i16 @i16_signext_func_void() #0 {
+ %val = load i16, i16 addrspace(1)* undef
+ ret i16 %val
+}
+
+; GCN-LABEL: {{^}}i32_func_void:
+; GCN: buffer_load_dword v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i32 @i32_func_void() #0 {
+ %val = load i32, i32 addrspace(1)* undef
+ ret i32 %val
+}
+
+; GCN-LABEL: {{^}}i64_func_void:
+; GCN: buffer_load_dwordx2 v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define i64 @i64_func_void() #0 {
+ %val = load i64, i64 addrspace(1)* undef
+ ret i64 %val
+}
+
+; GCN-LABEL: {{^}}f32_func_void:
+; GCN: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define float @f32_func_void() #0 {
+ %val = load float, float addrspace(1)* undef
+ ret float %val
+}
+
+; GCN-LABEL: {{^}}f64_func_void:
+; GCN: buffer_load_dwordx2 v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define double @f64_func_void() #0 {
+ %val = load double, double addrspace(1)* undef
+ ret double %val
+}
+
+; GCN-LABEL: {{^}}v2i32_func_void:
+; GCN: buffer_load_dwordx2 v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <2 x i32> @v2i32_func_void() #0 {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
+ ret <2 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v3i32_func_void:
+; GCN: buffer_load_dwordx4 v[0:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <3 x i32> @v3i32_func_void() #0 {
+ %val = load <3 x i32>, <3 x i32> addrspace(1)* undef
+ ret <3 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v4i32_func_void:
+; GCN: buffer_load_dwordx4 v[0:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <4 x i32> @v4i32_func_void() #0 {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
+ ret <4 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v5i32_func_void:
+; GCN-DAG: buffer_load_dword v4, off
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <5 x i32> @v5i32_func_void() #0 {
+ %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
+ ret <5 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v8i32_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <8 x i32> @v8i32_func_void() #0 {
+ %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+ ret <8 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v16i32_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <16 x i32> @v16i32_func_void() #0 {
+ %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+ ret <16 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v32i32_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN-DAG: buffer_load_dwordx4 v[16:19], off
+; GCN-DAG: buffer_load_dwordx4 v[20:23], off
+; GCN-DAG: buffer_load_dwordx4 v[24:27], off
+; GCN-DAG: buffer_load_dwordx4 v[28:31], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <32 x i32> @v32i32_func_void() #0 {
+ %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
+ ret <32 x i32> %val
+}
+
+; GCN-LABEL: {{^}}v2i64_func_void:
+; GCN: buffer_load_dwordx4 v[0:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <2 x i64> @v2i64_func_void() #0 {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* undef
+ ret <2 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v3i64_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <3 x i64> @v3i64_func_void() #0 {
+ %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
+ %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
+ ret <3 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v4i64_func_void:
+; GCN: buffer_load_dwordx4 v[0:3], off
+; GCN: buffer_load_dwordx4 v[4:7], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <4 x i64> @v4i64_func_void() #0 {
+ %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
+ ret <4 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v5i64_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <5 x i64> @v5i64_func_void() #0 {
+ %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
+ %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
+ ret <5 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v8i64_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <8 x i64> @v8i64_func_void() #0 {
+ %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
+ %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
+ ret <8 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v16i64_func_void:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], off
+; GCN-DAG: buffer_load_dwordx4 v[4:7], off
+; GCN-DAG: buffer_load_dwordx4 v[8:11], off
+; GCN-DAG: buffer_load_dwordx4 v[12:15], off
+; GCN-DAG: buffer_load_dwordx4 v[16:19], off
+; GCN-DAG: buffer_load_dwordx4 v[20:23], off
+; GCN-DAG: buffer_load_dwordx4 v[24:27], off
+; GCN-DAG: buffer_load_dwordx4 v[28:31], off
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <16 x i64> @v16i64_func_void() #0 {
+ %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
+ ret <16 x i64> %val
+}
+
+; GCN-LABEL: {{^}}v2i16_func_void:
+; GFX9: buffer_load_dword v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @v2i16_func_void() #0 {
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
+ ret <2 x i16> %val
+}
+
+; GCN-LABEL: {{^}}v3i16_func_void:
+; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <3 x i16> @v3i16_func_void() #0 {
+ %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
+ ret <3 x i16> %val
+}
+
+; GCN-LABEL: {{^}}v4i16_func_void:
+; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <4 x i16> @v4i16_func_void() #0 {
+ %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
+ ret <4 x i16> %val
+}
+
+; FIXME: Should not scalarize
+; GCN-LABEL: {{^}}v5i16_func_void:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9: v_mov_b32_e32 v2, v1
+; GFX9: v_lshrrev_b32_e32 v3, 16, v0
+; GCN: s_setpc_b64
+define <5 x i16> @v5i16_func_void() #0 {
+ %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
+ %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
+ ret <5 x i16> %val
+}
+
+; GCN-LABEL: {{^}}v8i16_func_void:
+; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <8 x i16> @v8i16_func_void() #0 {
+ %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
+ %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+ ret <8 x i16> %val
+}
+
+; GCN-LABEL: {{^}}v16i16_func_void:
+; GFX9: buffer_load_dwordx4 v[0:3], off
+; GFX9: buffer_load_dwordx4 v[4:7], off
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64
+define <16 x i16> @v16i16_func_void() #0 {
+ %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+ ret <16 x i16> %val
+}
+
+; FIXME: Should pack
+; GCN-LABEL: {{^}}v16i8_func_void:
+; GCN-DAG: v12
+; GCN-DAG: v13
+; GCN-DAG: v14
+; GCN-DAG: v15
+define <16 x i8> @v16i8_func_void() #0 {
+ %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
+ %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+ ret <16 x i8> %val
+}
+
+; FIXME: Should pack
+; GCN-LABEL: {{^}}v4i8_func_void:
+; GCN: buffer_load_dword v0
+; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
+; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
+; CI-DAG: v_bfe_u32 v1, v0, 8, 8
+; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0
+; GCN: s_setpc_b64
+define <4 x i8> @v4i8_func_void() #0 {
+ %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
+ ret <4 x i8> %val
+}
+
+; GCN-LABEL: {{^}}struct_i8_i32_func_void:
+; GCN-DAG: buffer_load_dword v1
+; GCN-DAG: buffer_load_ubyte v0
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define {i8, i32} @struct_i8_i32_func_void() #0 {
+ %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
+ ret { i8, i32 } %val
+}
+
+; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
+; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
+; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}}
+; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}}
+define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 {
+ %val0 = load volatile i8, i8 addrspace(1)* undef
+ %val1 = load volatile i32, i32 addrspace(1)* undef
+ %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
+ %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
+ store i8 %val0, i8* %gep0
+ store i32 %val1, i32* %gep1
+ ret void
+}
+
+; GCN-LABEL: {{^}}v33i32_func_void:
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define <33 x i32> @v33i32_func_void() #0 {
+ %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
+ %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
+ ret <33 x i32> %val
+}
+
+; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
+ %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
+ %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
+ ret { <32 x i32>, i32 } %val
+}
+
+; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}}
+; GCN: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
+ %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
+ %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
+ ret { i32, <32 x i32> } %val
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index d96b796d4495..35aeeeaa225c 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -27,7 +27,7 @@
; ELF: Symbol {
; ELF: Name: simple
-; ELF: Size: 44
+; ELF: Size: 48
; ELF: Type: Function (0x2)
; ELF: }
@@ -41,14 +41,12 @@
; HSA: .p2align 2
; HSA: {{^}}simple:
; HSA-NOT: amd_kernel_code_t
-
-; FIXME: Check this isn't a kernarg load when calling convention implemented.
-; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; Make sure we are setting the ATC bit:
-; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
+; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000
; On VI+ we also need to set MTYPE = 2
-; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
+; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000
; Make sure we generate flat store for HSA
; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
@@ -56,8 +54,9 @@
; HSA: .size simple, .Lfunc_end0-simple
; HSA: ; Function info:
; HSA-NOT: COMPUTE_PGM_RSRC2
-define void @simple(i32 addrspace(1)* %out) {
+define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
entry:
+ %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
store i32 0, i32 addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index b160af86a2b6..4a0213dd1de5 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -12,7 +12,7 @@
; SI: s_endpgm
define amdgpu_kernel void @br_i1_phi(i32 %arg) {
bb:
- %tidig = call i32 @llvm.r600.read.tidig.x() #0
+ %tidig = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = trunc i32 %tidig to i1
br i1 %cmp, label %bb2, label %bb3
@@ -32,6 +32,6 @@ bb6: ; preds = %bb4, %bb3
ret void
}
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
-attributes #0 = { readnone }
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index 636b45db698d..36441cf778c2 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -191,7 +191,7 @@ entry:
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]
-define void @i64_imm_input_phys_vgpr() {
+define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
entry:
call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 56966a19cf7b..1fc77893e7e9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -356,6 +356,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -371,6 +372,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 3d64f93db2e4..eee8351de79b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -207,6 +207,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -222,6 +223,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 5f8ca28ec5f0..1b937ab93247 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -7,14 +7,13 @@
; GFX9: flat_store_dword
; GFX9-NOT: s_waitcnt
; GCN: s_barrier
-define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
store i32 %tmp, i32 addrspace(1)* %tmp1
call void @llvm.amdgcn.s.barrier()
- %tmp2 = call i32 @llvm.r600.read.local.size.x()
- %tmp3 = sub i32 %tmp2, 1
+ %tmp3 = sub i32 %size, 1
%tmp4 = sub i32 %tmp3, %tmp
%tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
%tmp6 = load i32, i32 addrspace(1)* %tmp5
@@ -24,7 +23,6 @@ entry:
declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2
-declare i32 @llvm.r600.read.local.size.x() #2
attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
diff --git a/test/CodeGen/AMDGPU/lshl64-to-32.ll b/test/CodeGen/AMDGPU/lshl64-to-32.ll
new file mode 100644
index 000000000000..5ff6b71c1f02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lshl64-to-32.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}zext_shl64_to_32:
+; CHECK: s_lshl_b32
+; CHECK-NOT: s_lshl_b64
+define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+ %and = and i32 %x, 1073741823
+ %ext = zext i32 %and to i64
+ %shl = shl i64 %ext, 2
+ store i64 %shl, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}sext_shl64_to_32:
+; CHECK: s_lshl_b32
+; CHECK-NOT: s_lshl_b64
+define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
+ %and = and i32 %x, 536870911
+ %ext = sext i32 %and to i64
+ %shl = shl i64 %ext, 2
+ store i64 %shl, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}zext_shl64_overflow:
+; CHECK: s_lshl_b64
+; CHECK-NOT: s_lshl_b32
+define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+ %and = and i32 %x, 2147483647
+ %ext = zext i32 %and to i64
+ %shl = shl i64 %ext, 2
+ store i64 %shl, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}sext_shl64_overflow:
+; CHECK: s_lshl_b64
+; CHECK-NOT: s_lshl_b32
+define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
+ %and = and i32 %x, 2147483647
+ %ext = sext i32 %and to i64
+ %shl = shl i64 %ext, 2
+ store i64 %shl, i64 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/packed-op-sel.ll b/test/CodeGen/AMDGPU/packed-op-sel.ll
index 6ff0c54c33d0..4970375d40d3 100644
--- a/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -181,8 +181,7 @@ bb:
; GCN-NOT: shl
; GCN-NOT: or
-; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
-; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
%vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
@@ -260,6 +259,434 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
+ %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
+
+ %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}add_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
+define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
+
+ %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+ %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
+
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
+ %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
+ %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+ %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+ %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %f32 = load volatile float, float addrspace(3)* undef, align 4
+ %neg.f32 = fsub float -0.0, %f32
+ %bc = bitcast float %neg.f32 to <2 x half>
+ %result = fadd <2 x half> %vec0, %bc
+
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+
+ %f32 = load volatile float, float addrspace(3)* undef, align 4
+ %neg.f32 = fsub float -0.0, %f32
+ %bc = bitcast float %neg.f32 to <2 x half>
+ %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+ %result = fadd <2 x half> %vec0, %shuf
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}extract_from_i64:
+; GCN: v_lshl_or_b32
+; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+ %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+ %i64 = load volatile i64, i64 addrspace(1)* undef
+
+ %elt0 = trunc i64 %i64 to i16
+ %hi = lshr i64 %i64, 16
+ %elt1 = trunc i64 %hi to i16
+
+ %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
+ %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
+ %result = add <2 x i16> %vec0, %ins1
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %scalar0 = load volatile i16, i16 addrspace(1)* undef
+ %shl = shl i16 %scalar0, 1
+ %shl.bc = bitcast i16 %shl to half
+
+ %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+ %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}mix_elt_types_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+ %scalar0 = load volatile i16, i16 addrspace(1)* undef
+ %scalar1 = load volatile half, half addrspace(1)* undef
+ %shl = shl i16 %scalar0, 1
+ %shl.bc = bitcast i16 %shl to half
+
+ %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
+
+ %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+ %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/regcoalesce-prune.mir b/test/CodeGen/AMDGPU/regcoalesce-prune.mir
new file mode 100644
index 000000000000..7ad474bf0ed2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoalesce-prune.mir
@@ -0,0 +1,31 @@
+# RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa-opencl -run-pass=simple-register-coalescing | FileCheck %s
+---
+# Checks for a bug where subregister live ranges were not properly pruned for
+# an IMPLICIT_DEF that gets removed completely.
+#
+# CHECK-LABEL: name: func
+# IMPLICIT_DEF should be gone without llc hitting assertion failures.
+# CHECK-NOT: IMPLICIT_DEF
+name: func
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %5.sub1 = V_MOV_B32_e32 0, implicit %exec
+ %6 = COPY %5
+ S_CBRANCH_VCCZ %bb.2, implicit undef %vcc
+
+ bb.1:
+ %1 : sreg_32_xm0 = S_MOV_B32 0
+ undef %0.sub0 : sreg_64 = COPY %1
+ %0.sub1 = COPY %1
+ %4 : vreg_64 = COPY killed %0
+ %5 : vreg_64 = IMPLICIT_DEF
+ %6 : vreg_64 = COPY killed %4
+
+ bb.2:
+ %2 : vgpr_32 = V_CVT_F32_I32_e32 killed %5.sub1, implicit %exec
+
+ bb.3:
+ %3 : vgpr_32 = V_CVT_F32_I32_e32 killed %6.sub1, implicit %exec
+ S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 1e0ac3807528..73defc17d04f 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -393,3 +393,53 @@ store_label:
store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
ret void
}
+
+
+; Check that "pulling out" SDWA operands works correctly.
+; GCN-LABEL: {{^}}pulled_out_test:
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_and_b32_sdwa
+; NOSDWA-NOT: v_or_b32_sdwa
+
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+
+define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
+entry:
+ %idxprom = ashr exact i64 15, 32
+ %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
+ %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8
+
+ %tmp1 = extractelement <8 x i8> %tmp, i32 0
+ %tmp2 = extractelement <8 x i8> %tmp, i32 1
+ %tmp3 = extractelement <8 x i8> %tmp, i32 2
+ %tmp4 = extractelement <8 x i8> %tmp, i32 3
+ %tmp5 = extractelement <8 x i8> %tmp, i32 4
+ %tmp6 = extractelement <8 x i8> %tmp, i32 5
+ %tmp7 = extractelement <8 x i8> %tmp, i32 6
+ %tmp8 = extractelement <8 x i8> %tmp, i32 7
+
+ %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
+ %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
+ %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
+ %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
+ %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
+ %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
+ %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
+ %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1
+
+ %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+ %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
+ store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index 6f5fc6d0f38c..36c33b876919 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -299,10 +299,10 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)*
}
; GCN-LABEL: {{^}}and_not_mask_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
-; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
+; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
-; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]]
+; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
@@ -360,10 +360,9 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac
}
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index 1daf4bb33e81..cb40ecf2de1c 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -201,7 +201,8 @@ define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
; GCN-LABEL: {{^}}v_lshr_32_i64:
; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI1:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], v[[VHI1]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index 6642411f7a63..cf9e714ea6d3 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
diff --git a/test/CodeGen/AMDGPU/subreg_interference.mir b/test/CodeGen/AMDGPU/subreg_interference.mir
index 24d06a576c2a..6fc22c8d189f 100644
--- a/test/CodeGen/AMDGPU/subreg_interference.mir
+++ b/test/CodeGen/AMDGPU/subreg_interference.mir
@@ -1,4 +1,12 @@
# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+--- |
+
+ define amdgpu_kernel void @func0() {
+ ret void
+ }
+
+...
+
---
# We should not detect any interference between v0/v1 here and only allocate
# sgpr0-sgpr3.
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index e82e548f23cd..135f02ac205a 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -186,7 +186,7 @@ bb12: ; preds = %bb145, %bb
%tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
%tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
%tmp142 = bitcast float %tmp95 to i32
- %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tmp143 = icmp sgt i32 %tmp142, %tid
br i1 %tmp143, label %bb144, label %bb145
@@ -593,7 +593,7 @@ bb145: ; preds = %bb12
br label %bb12
}
-declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 53577dbd76f6..1a0c7fd8e1d6 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -699,3 +699,33 @@ define i32 @test_shufflevector_v4s32_v2s32(i32 %arg1, i32 %arg2, i32 %arg3, i32
%res = extractelement <2 x i32> %shuffle, i32 0
ret i32 %res
}
+
+%struct.v2s32 = type { <2 x i32> }
+
+define i32 @test_constantstruct_v2s32() {
+; CHECK-LABEL: name: test_constantstruct_v2s32
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
+ %vec = extractvalue %struct.v2s32 {<2 x i32><i32 1, i32 2>}, 0
+ %elt = extractelement <2 x i32> %vec, i32 0
+ ret i32 %elt
+}
+
+%struct.v2s32.s32.s32 = type { <2 x i32>, i32, i32 }
+
+define i32 @test_constantstruct_v2s32_s32_s32() {
+; CHECK-LABEL: name: test_constantstruct_v2s32_s32_s32
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK: [[C4:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[CS:%[0-9]+]](s128) = G_SEQUENCE [[VEC]](<2 x s32>), 0, [[C3]](s32), 64, [[C4]](s32), 96
+; CHECK: [[EXT:%[0-9]+]](<2 x s32>) = G_EXTRACT [[CS]](s128), 0
+; CHECK: G_EXTRACT_VECTOR_ELT [[EXT]](<2 x s32>)
+ %vec = extractvalue %struct.v2s32.s32.s32 {<2 x i32><i32 1, i32 2>, i32 3, i32 4}, 0
+ %elt = extractelement <2 x i32> %vec, i32 0
+ ret i32 %elt
+}
diff --git a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
index a44c9721d6c1..1c8142e5ddd5 100644
--- a/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
+++ b/test/CodeGen/ARM/ifcvt-branch-weight-bug.ll
@@ -22,7 +22,7 @@ entry:
; for.body -> for.cond.backedge (100%)
; -> cond.false.i (0%)
; CHECK: BB#1: derived from LLVM BB %for.body
-; CHECK: Successors according to CFG: BB#2(0x7ffffc00 / 0x80000000 = 100.00%) BB#4(0x00000400 / 0x80000000 = 0.00%)
+; CHECK: Successors according to CFG: BB#2(0x80000000 / 0x80000000 = 100.00%) BB#4(0x00000001 / 0x80000000 = 0.00%)
for.body:
br i1 undef, label %for.cond.backedge, label %lor.lhs.false.i, !prof !1
diff --git a/test/CodeGen/Generic/opt-codegen-no-target-machine.ll b/test/CodeGen/Generic/opt-codegen-no-target-machine.ll
new file mode 100644
index 000000000000..c6cb1c2b657b
--- /dev/null
+++ b/test/CodeGen/Generic/opt-codegen-no-target-machine.ll
@@ -0,0 +1,3 @@
+; RUN: not opt %s -dwarfehprepare -o - 2>&1 | FileCheck %s
+
+; CHECK: Trying to construct TargetPassConfig without a target machine. Scheduling a CodeGen pass without a target triple set?
diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll
new file mode 100644
index 000000000000..be3865703ba2
--- /dev/null
+++ b/test/CodeGen/Mips/dins.ll
@@ -0,0 +1,70 @@
+; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2
+; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2
+; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16
+
+; #include <stdint.h>
+; #include <stdio.h>
+; struct cvmx_buf_ptr {
+
+; struct {
+; unsigned long long addr :37;
+; unsigned long long addr1 :15;
+; unsigned int length:14;
+; uint64_t total_bytes:16;
+; uint64_t segs : 6;
+; } s;
+; }
+;
+; unsigned long long foo(volatile struct cvmx_buf_ptr bufptr) {
+; bufptr.s.addr = 123;
+; bufptr.s.segs = 4;
+; bufptr.s.length = 5;
+; bufptr.s.total_bytes = bufptr.s.length;
+; return bufptr.s.addr;
+; }
+
+; Test selection of the INS/DINS instructions.
+
+define i64 @f123(i64 inreg %bufptr.coerce0, i64 inreg %bufptr.coerce1) local_unnamed_addr #0 {
+entry:
+ %bufptr.sroa.0 = alloca i64, align 8
+ %bufptr.sroa.4 = alloca i64, align 8
+ store i64 %bufptr.coerce0, i64* %bufptr.sroa.0, align 8
+ store i64 %bufptr.coerce1, i64* %bufptr.sroa.4, align 8
+ %bufptr.sroa.0.0.bufptr.sroa.0.0.bufptr.sroa.0.0.bf.load = load volatile i64, i64* %bufptr.sroa.0, align 8
+ %bf.clear = and i64 %bufptr.sroa.0.0.bufptr.sroa.0.0.bufptr.sroa.0.0.bf.load, 134217727
+ %bf.set = or i64 %bf.clear, 16508780544
+ store volatile i64 %bf.set, i64* %bufptr.sroa.0, align 8
+ %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load2 = load volatile i64, i64* %bufptr.sroa.4, align 8
+ %bf.clear3 = and i64 %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load2, -16911433729
+ %bf.set4 = or i64 %bf.clear3, 1073741824
+ store volatile i64 %bf.set4, i64* %bufptr.sroa.4, align 8
+ %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load6 = load volatile i64, i64* %bufptr.sroa.4, align 8
+ %bf.clear7 = and i64 %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load6, 1125899906842623
+ %bf.set8 = or i64 %bf.clear7, 5629499534213120
+ store volatile i64 %bf.set8, i64* %bufptr.sroa.4, align 8
+ %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load11 = load volatile i64, i64* %bufptr.sroa.4, align 8
+ %bf.lshr = lshr i64 %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load11, 50
+ %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load13 = load volatile i64, i64* %bufptr.sroa.4, align 8
+ %bf.shl = shl nuw nsw i64 %bf.lshr, 34
+ %bf.clear14 = and i64 %bufptr.sroa.4.0.bufptr.sroa.4.0.bufptr.sroa.4.8.bf.load13, -1125882726973441
+ %bf.set15 = or i64 %bf.clear14, %bf.shl
+ store volatile i64 %bf.set15, i64* %bufptr.sroa.4, align 8
+ %bufptr.sroa.0.0.bufptr.sroa.0.0.bufptr.sroa.0.0.bf.load17 = load volatile i64, i64* %bufptr.sroa.0, align 8
+ %bf.lshr18 = lshr i64 %bufptr.sroa.0.0.bufptr.sroa.0.0.bufptr.sroa.0.0.bf.load17, 27
+ ret i64 %bf.lshr18
+}
+
+
+; CHECK-LABEL: f123:
+; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123
+; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37
+; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5
+; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4
+; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6
+; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14
+; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50
+; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 34, 16
+; MIPS32R2: ins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 16
+; MIPS32R2-NOT: ins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 18, 46
+; MIPS16-NOT: ins{{[[:space:]].*}} \ No newline at end of file
diff --git a/test/CodeGen/Mips/micromips-attr.ll b/test/CodeGen/Mips/micromips-attr.ll
new file mode 100644
index 000000000000..78bcc04a9b0c
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-attr.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=mips -mcpu=mips32 --mattr=-micromips < %s | FileCheck %s
+
+define void @foo() #0 {
+entry:
+ ret void
+}
+; CHECK: .set micromips
+; CHECK-NEXT: .set nomips16
+; CHECK-NEXT: .ent foo
+; CHECK-NEXT: foo:
+
+define void @bar() #1 {
+entry:
+ ret void
+}
+; CHECK: .set nomicromips
+; CHECK-NEXT: .set nomips16
+; CHECK-NEXT: .ent bar
+; CHECK-NEXT: bar:
+
+attributes #0 = {
+ nounwind "micromips"
+ "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
+ "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
+ "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
+ "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
+ "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+ "use-soft-float"="false"
+}
+
+attributes #1 = {
+ nounwind
+ "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
+ "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
+ "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false"
+ "no-signed-zeros-fp-math"="false" "no-trapping-math"="false"
+ "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+ "use-soft-float"="false"
+}
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index a6dafb1abfd6..aa73c522eda5 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -418,18 +418,17 @@ entry:
declare fp128 @llvm.powi.f128(fp128, i32) #3
; ALL-LABEL: libcall2_copysignl:
-; ALL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
-; ALL-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63
+; NOT-R2R6-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
+; NOT-R2R6-DAG: dsll $[[R3:[0-9]+]], $[[R2]], 63
; ALL-DAG: ld $[[R0:[0-9]+]], %got_disp(gld1)
; ALL-DAG: ld $[[R1:[0-9]+]], 8($[[R0]])
-; ALL-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]]
+; NOT-R2R6-DAG: and $[[R4:[0-9]+]], $[[R1]], $[[R3]]
; ALL-DAG: ld $[[R5:[0-9]+]], %got_disp(gld0)
; ALL-DAG: ld $[[R6:[0-9]+]], 8($[[R5]])
+; R2R6: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 63, 1
; NOT-R2R6-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1
; NOT-R2R6-DAG: and $[[R8:[0-9]+]], $[[R6]], $[[R7]]
; NOT-R2R6-DAG: or $4, $[[R8]], $[[R4]]
-; R2R6-DAG: dextm $[[R7:[0-9]+]], $[[R6]], 0, 63
-; R2R6-DAG: or $4, $[[R7]], $[[R4]]
; ALL-DAG: ld $2, 0($[[R5]])
define fp128 @libcall2_copysignl() {
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll
index ecdf55ecdbeb..fb01eb262adc 100644
--- a/test/CodeGen/NVPTX/sched1.ll
+++ b/test/CodeGen/NVPTX/sched1.ll
@@ -6,11 +6,11 @@ define void @foo(i32* %a) {
; CHECK: .func foo
; CHECK: ld.u32
; CHECK-NEXT: ld.u32
-; CHECK-NEXT: add.s32
; CHECK-NEXT: ld.u32
-; CHECK-NEXT: add.s32
; CHECK-NEXT: ld.u32
; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
%ptr0 = getelementptr i32, i32* %a, i32 0
%val0 = load i32, i32* %ptr0
%ptr1 = getelementptr i32, i32* %a, i32 1
diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll
index 347f77c5682c..91ed77878f81 100644
--- a/test/CodeGen/NVPTX/sched2.ll
+++ b/test/CodeGen/NVPTX/sched2.ll
@@ -4,12 +4,12 @@ define void @foo(<2 x i32>* %a) {
; CHECK: .func foo
; CHECK: ld.v2.u32
; CHECK-NEXT: ld.v2.u32
+; CHECK-NEXT: ld.v2.u32
+; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
-; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
-; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
%ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
index 93b39c1125f8..a86ba1e29d5c 100644
--- a/test/CodeGen/NVPTX/vec8.ll
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) {
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]
-; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
+; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
; CHECK: st.u8 [%[[B]]], [[T]];
%t0 = extractelement <8 x i8> %a, i32 1
%t1 = extractelement <8 x i8> %a, i32 6
diff --git a/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll b/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
index 64d02c5b9632..2aeb0e1f71f9 100644
--- a/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
+++ b/test/CodeGen/PowerPC/opt-cmp-inst-cr0-live.ll
@@ -1,5 +1,6 @@
; RUN: llc -verify-machineinstrs -print-before=peephole-opt -print-after=peephole-opt -mtriple=powerpc64-unknown-linux-gnu -o /dev/null 2>&1 < %s | FileCheck %s
+; CHECK-LABEL: fn1
define signext i32 @fn1(i32 %baz) {
%1 = mul nsw i32 %baz, 208
%2 = zext i32 %1 to i64
@@ -21,3 +22,35 @@ foo:
bar:
ret i32 0
}
+
+; CHECK-LABEL: fn2
+define signext i32 @fn2(i64 %a, i64 %b) {
+; CHECK: OR8o {{[^, ]+}}, {{[^, ]+}}, %CR0<imp-def>;
+; CHECK: [[CREG:[^, ]+]]<def> = COPY %CR0
+; CHECK: BCC 12, [[CREG]]<kill>
+ %1 = or i64 %b, %a
+ %2 = icmp sgt i64 %1, -1
+ br i1 %2, label %foo, label %bar
+
+foo:
+ ret i32 1
+
+bar:
+ ret i32 0
+}
+
+; CHECK-LABEL: fn3
+define signext i32 @fn3(i32 %a) {
+; CHECK: ANDIo {{[^, ]+}}, 10, %CR0<imp-def>;
+; CHECK: [[CREG:[^, ]+]]<def> = COPY %CR0
+; CHECK: BCC 76, [[CREG]]<kill>
+ %1 = and i32 %a, 10
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %foo, label %bar
+
+foo:
+ ret i32 1
+
+bar:
+ ret i32 0
+}
diff --git a/test/CodeGen/PowerPC/shift128.ll b/test/CodeGen/PowerPC/shift128.ll
index 17a380c71c35..48e1b96f838b 100644
--- a/test/CodeGen/PowerPC/shift128.ll
+++ b/test/CodeGen/PowerPC/shift128.ll
@@ -1,14 +1,98 @@
-; RUN: llc -verify-machineinstrs < %s -march=ppc64 | grep sld | count 5
+; RUN: llc -verify-machineinstrs < %s | FileCheck --check-prefix=P8 --check-prefix=CHECK %s
+; RUN: llc -mcpu=pwr9 -verify-machineinstrs < %s | FileCheck --check-prefix=P9 --check-prefix=CHECK %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
-define i128 @foo_lshr(i128 %x, i128 %y) {
+; CHECK-LABEL: lshr:
+; CHECK-DAG: subfic [[R0:[0-9]+]], 5, 64
+; CHECK-DAG: addi [[R1:[0-9]+]], 5, -64
+; CHECK-DAG: srd [[R2:[0-9]+]], 3, 5
+; CHECK-DAG: sld [[R3:[0-9]+]], 4, [[R0]]
+; CHECK-DAG: srd [[R4:[0-9]+]], 4, [[R1]]
+; CHECK-DAG: or [[R5:[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: or 3, [[R5]], [[R4]]
+; CHECK-DAG: srd 4, 4, 5
+; CHECK: blr
+define i128 @lshr(i128 %x, i128 %y) {
%r = lshr i128 %x, %y
ret i128 %r
}
-define i128 @foo_ashr(i128 %x, i128 %y) {
+; CHECK-LABEL: ashr:
+; CHECK-DAG: subfic [[R0:[0-9]+]], 5, 64
+; CHECK-DAG: addi [[R1:[0-9]+]], 5, -64
+; CHECK-DAG: srd [[R2:[0-9]+]], 3, 5
+; CHECK-DAG: sld [[R3:[0-9]+]], 4, [[R0]]
+; CHECK-DAG: srad [[R4:[0-9]+]], 4, [[R1]]
+; CHECK-DAG: or [[R5:[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: cmpwi [[R1]], 1
+; CHECK-DAG: srad 4, 4, 5
+; CHECK: isel 3, [[R5]], [[R4]], 0
+; CHECK: blr
+define i128 @ashr(i128 %x, i128 %y) {
%r = ashr i128 %x, %y
ret i128 %r
}
-define i128 @foo_shl(i128 %x, i128 %y) {
+; CHECK-LABEL: shl:
+; CHECK-DAG: subfic [[R0:[0-9]+]], 5, 64
+; CHECK-DAG: addi [[R1:[0-9]+]], 5, -64
+; CHECK-DAG: sld [[R2:[0-9]+]], 4, 5
+; CHECK-DAG: srd [[R3:[0-9]+]], 3, [[R0]]
+; CHECK-DAG: sld [[R4:[0-9]+]], 3, [[R1]]
+; CHECK-DAG: or [[R5:[0-9]+]], [[R2]], [[R3]]
+; CHECK-DAG: or 4, [[R5]], [[R4]]
+; CHECK-DAG: sld 3, 3, 5
+; CHECK: blr
+define i128 @shl(i128 %x, i128 %y) {
%r = shl i128 %x, %y
ret i128 %r
}
+
+; CHECK-LABEL: shl_v1i128:
+; P8-NOT: {{\b}}vslo
+; P8-NOT: {{\b}}vsl
+; P9-DAG: vslo
+; P9-DAG: vspltb
+; P9: vsl
+; P9-NOT: {{\b}}sld
+; P9-NOT: {{\b}}srd
+; CHECK: blr
+define i128 @shl_v1i128(i128 %arg, i128 %amt) local_unnamed_addr #0 {
+entry:
+ %0 = insertelement <1 x i128> undef, i128 %arg, i32 0
+ %1 = insertelement <1 x i128> undef, i128 %amt, i32 0
+ %2 = shl <1 x i128> %0, %1
+ %retval = extractelement <1 x i128> %2, i32 0
+ ret i128 %retval
+}
+
+; CHECK-LABEL: lshr_v1i128:
+; P8-NOT: {{\b}}vsro
+; P8-NOT: {{\b}}vsr
+; P9-DAG: vsro
+; P9-DAG: vspltb
+; P9: vsr
+; P9-NOT: {{\b}}srd
+; P9-NOT: {{\b}}sld
+; CHECK: blr
+define i128 @lshr_v1i128(i128 %arg, i128 %amt) local_unnamed_addr #0 {
+entry:
+ %0 = insertelement <1 x i128> undef, i128 %arg, i32 0
+ %1 = insertelement <1 x i128> undef, i128 %amt, i32 0
+ %2 = lshr <1 x i128> %0, %1
+ %retval = extractelement <1 x i128> %2, i32 0
+ ret i128 %retval
+}
+
+; Arithmetic shift right is not available as an operation on the vector registers.
+; CHECK-LABEL: ashr_v1i128:
+; CHECK-NOT: {{\b}}vsro
+; CHECK-NOT: {{\b}}vsr
+; CHECK: blr
+define i128 @ashr_v1i128(i128 %arg, i128 %amt) local_unnamed_addr #0 {
+entry:
+ %0 = insertelement <1 x i128> undef, i128 %arg, i32 0
+ %1 = insertelement <1 x i128> undef, i128 %amt, i32 0
+ %2 = ashr <1 x i128> %0, %1
+ %retval = extractelement <1 x i128> %2, i32 0
+ ret i128 %retval
+}
diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll
index d586fe183a92..87e0c4621c08 100644
--- a/test/CodeGen/SPARC/LeonItinerariesUT.ll
+++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll
@@ -28,8 +28,8 @@
; LEON3_4_ITIN-LABEL: f32_ops:
; LEON3_4_ITIN: ld
; LEON3_4_ITIN-NEXT: ld
-; LEON3_4_ITIN-NEXT: fadds
; LEON3_4_ITIN-NEXT: ld
+; LEON3_4_ITIN-NEXT: fadds
; LEON3_4_ITIN-NEXT: ld
; LEON3_4_ITIN-NEXT: fsubs
; LEON3_4_ITIN-NEXT: fmuls
@@ -47,4 +47,4 @@ entry:
%6 = fmul float %5, %3
%7 = fdiv float %6, %4
ret float %7
-}
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 24aa5b98d0bb..4ec703921e29 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
; CHECK: mulss
; CHECK: mulss
-; CHECK: addss
; CHECK: mulss
-; CHECK: addss
; CHECK: mulss
; CHECK: addss
+; CHECK: addss
+; CHECK: addss
; CHECK: ret
}
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 85db1c0e7e7a..55c825464039 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
-; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X64-LABEL: test_add_i64:
diff --git a/test/CodeGen/X86/GlobalISel/add-vec.ll b/test/CodeGen/X86/GlobalISel/add-vec.ll
new file mode 100644
index 000000000000..679a49d733a2
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/add-vec.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
+
+define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
+; SKX-LABEL: test_add_v16i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = add <16 x i8> %arg1, %arg2
+ ret <16 x i8> %ret
+}
+
+define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
+; SKX-LABEL: test_add_v8i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = add <8 x i16> %arg1, %arg2
+ ret <8 x i16> %ret
+}
+
+define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+; SKX-LABEL: test_add_v4i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = add <4 x i32> %arg1, %arg2
+ ret <4 x i32> %ret
+}
+
+define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
+; SKX-LABEL: test_add_v2i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = add <2 x i64> %arg1, %arg2
+ ret <2 x i64> %ret
+}
+
+define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
+; SKX-LABEL: test_add_v32i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = add <32 x i8> %arg1, %arg2
+ ret <32 x i8> %ret
+}
+
+define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
+; SKX-LABEL: test_add_v16i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = add <16 x i16> %arg1, %arg2
+ ret <16 x i16> %ret
+}
+
+define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
+; SKX-LABEL: test_add_v8i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = add <8 x i32> %arg1, %arg2
+ ret <8 x i32> %ret
+}
+
+define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
+; SKX-LABEL: test_add_v4i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = add <4 x i64> %arg1, %arg2
+ ret <4 x i64> %ret
+}
+
+define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
+; SKX-LABEL: test_add_v64i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = add <64 x i8> %arg1, %arg2
+ ret <64 x i8> %ret
+}
+
+define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
+; SKX-LABEL: test_add_v32i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = add <32 x i16> %arg1, %arg2
+ ret <32 x i16> %ret
+}
+
+define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
+; SKX-LABEL: test_add_v16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = add <16 x i32> %arg1, %arg2
+ ret <16 x i32> %ret
+}
+
+define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
+; SKX-LABEL: test_add_v8i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = add <8 x i64> %arg1, %arg2
+ ret <8 x i64> %ret
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll
index 1aae1db8ab07..d7ae4435682f 100644
--- a/test/CodeGen/X86/GlobalISel/binop.ll
+++ b/test/CodeGen/X86/GlobalISel/binop.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL
define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_sub_i64:
diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll
index faa6a0350337..387e8797f0cd 100644
--- a/test/CodeGen/X86/GlobalISel/br.ll
+++ b/test/CodeGen/X86/GlobalISel/br.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
define void @uncondbr() {
; CHECK-LABEL: uncondbr:
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
index c7e4d91ac3c7..997115d4d900 100644
--- a/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL
-; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL
-; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL
+; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL
define i32 @test_ret_i32() {
; X32-LABEL: test_ret_i32:
diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll
index 03692bb6b1de..39fee409d785 100644
--- a/test/CodeGen/X86/GlobalISel/cmp.ll
+++ b/test/CodeGen/X86/GlobalISel/cmp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
; ALL-LABEL: test_icmp_eq_i8:
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index cab043a51f05..b550bb0bc7be 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define i8 @const_i8() {
; ALL-LABEL: const_i8:
diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index 64cd0e70a4fd..b08ac062fb4b 100644
--- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
; TODO merge with ext.ll after i64 sext is supported on 32-bit platforms
diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll
index 4d4e3b05ca28..27aecd118b38 100644
--- a/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/test/CodeGen/X86/GlobalISel/ext.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
define i32 @test_zext_i1(i32 %a) {
; X64-LABEL: test_zext_i1:
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
index 2bb11adcc3b5..a9ec94defea8 100644
--- a/test/CodeGen/X86/GlobalISel/frameIndex.ll
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32
-; RUN: llc -mtriple=i386-linux-gnu < %s -o - | FileCheck %s --check-prefix=X32
-; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel < %s -o - | FileCheck %s --check-prefix=X32ABI
-; RUN: llc -mtriple=x86_64-linux-gnux32 < %s -o - | FileCheck %s --check-prefix=X32ABI
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI
+; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI
define i32* @allocai32() {
; X64-LABEL: allocai32:
diff --git a/test/CodeGen/X86/GlobalISel/gep.ll b/test/CodeGen/X86/GlobalISel/gep.ll
index bc5b0152b24a..94da9fb46761 100644
--- a/test/CodeGen/X86/GlobalISel/gep.ll
+++ b/test/CodeGen/X86/GlobalISel/gep.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64_GISEL
-; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64_GISEL
+; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define i32* @test_gep_i8(i32 *%arr, i8 %ind) {
; X64_GISEL-LABEL: test_gep_i8:
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir
new file mode 100644
index 000000000000..feba33ac91be
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir
@@ -0,0 +1,119 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+
+--- |
+ define void @test_add_v16i8() {
+ %ret = add <16 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_add_v8i16() {
+ %ret = add <8 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_add_v4i32() {
+ %ret = add <4 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_add_v2i64() {
+ %ret = add <2 x i64> undef, undef
+ ret void
+ }
+...
+---
+name: test_add_v16i8
+# ALL-LABEL: name: test_add_v16i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<16 x s8>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<16 x s8>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<16 x s8>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<16 x s8>) = IMPLICIT_DEF
+ %1(<16 x s8>) = IMPLICIT_DEF
+ %2(<16 x s8>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v8i16
+# ALL-LABEL: name: test_add_v8i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<8 x s16>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<8 x s16>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<8 x s16>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<8 x s16>) = IMPLICIT_DEF
+ %1(<8 x s16>) = IMPLICIT_DEF
+ %2(<8 x s16>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v4i32
+# ALL-LABEL: name: test_add_v4i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<4 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<4 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<4 x s32>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<4 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>) = IMPLICIT_DEF
+ %2(<4 x s32>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v2i64
+# ALL-LABEL: name: test_add_v2i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<2 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<2 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<2 x s64>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<2 x s64>) = IMPLICIT_DEF
+ %1(<2 x s64>) = IMPLICIT_DEF
+ %2(<2 x s64>) = G_ADD %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
new file mode 100644
index 000000000000..f7dc8031b4f5
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
@@ -0,0 +1,157 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+--- |
+ define void @test_add_v32i8() {
+ %ret = add <32 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_add_v16i16() {
+ %ret = add <16 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_add_v8i32() {
+ %ret = add <8 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_add_v4i64() {
+ %ret = add <4 x i64> undef, undef
+ ret void
+ }
+
+...
+---
+name: test_add_v32i8
+# ALL-LABEL: name: test_add_v32i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX1: %0(<32 x s8>) = IMPLICIT_DEF
+# AVX1-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
+# AVX1-NEXT: %3(<16 x s8>), %4(<16 x s8>) = G_UNMERGE_VALUES %0(<32 x s8>)
+# AVX1-NEXT: %5(<16 x s8>), %6(<16 x s8>) = G_UNMERGE_VALUES %1(<32 x s8>)
+# AVX1-NEXT: %7(<16 x s8>) = G_ADD %3, %5
+# AVX1-NEXT: %8(<16 x s8>) = G_ADD %4, %6
+# AVX1-NEXT: %2(<32 x s8>) = G_MERGE_VALUES %7(<16 x s8>), %8(<16 x s8>)
+# AVX1-NEXT: RET 0
+#
+# AVX2: %0(<32 x s8>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<32 x s8>) = G_ADD %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<32 x s8>) = IMPLICIT_DEF
+ %1(<32 x s8>) = IMPLICIT_DEF
+ %2(<32 x s8>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v16i16
+# ALL-LABEL: name: test_add_v16i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX1: %0(<16 x s16>) = IMPLICIT_DEF
+# AVX1-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
+# AVX1-NEXT: %3(<8 x s16>), %4(<8 x s16>) = G_UNMERGE_VALUES %0(<16 x s16>)
+# AVX1-NEXT: %5(<8 x s16>), %6(<8 x s16>) = G_UNMERGE_VALUES %1(<16 x s16>)
+# AVX1-NEXT: %7(<8 x s16>) = G_ADD %3, %5
+# AVX1-NEXT: %8(<8 x s16>) = G_ADD %4, %6
+# AVX1-NEXT: %2(<16 x s16>) = G_MERGE_VALUES %7(<8 x s16>), %8(<8 x s16>)
+# AVX1-NEXT: RET 0
+#
+# AVX2: %0(<16 x s16>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<16 x s16>) = G_ADD %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<16 x s16>) = IMPLICIT_DEF
+ %1(<16 x s16>) = IMPLICIT_DEF
+ %2(<16 x s16>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v8i32
+# ALL-LABEL: name: test_add_v8i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX1: %0(<8 x s32>) = IMPLICIT_DEF
+# AVX1-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
+# AVX1-NEXT: %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
+# AVX1-NEXT: %5(<4 x s32>), %6(<4 x s32>) = G_UNMERGE_VALUES %1(<8 x s32>)
+# AVX1-NEXT: %7(<4 x s32>) = G_ADD %3, %5
+# AVX1-NEXT: %8(<4 x s32>) = G_ADD %4, %6
+# AVX1-NEXT: %2(<8 x s32>) = G_MERGE_VALUES %7(<4 x s32>), %8(<4 x s32>)
+# AVX1-NEXT: RET 0
+#
+# AVX2: %0(<8 x s32>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<8 x s32>) = G_ADD %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>) = IMPLICIT_DEF
+ %2(<8 x s32>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v4i64
+# ALL-LABEL: name: test_add_v4i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX1: %0(<4 x s64>) = IMPLICIT_DEF
+# AVX1-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
+# AVX1-NEXT: %3(<2 x s64>), %4(<2 x s64>) = G_UNMERGE_VALUES %0(<4 x s64>)
+# AVX1-NEXT: %5(<2 x s64>), %6(<2 x s64>) = G_UNMERGE_VALUES %1(<4 x s64>)
+# AVX1-NEXT: %7(<2 x s64>) = G_ADD %3, %5
+# AVX1-NEXT: %8(<2 x s64>) = G_ADD %4, %6
+# AVX1-NEXT: %2(<4 x s64>) = G_MERGE_VALUES %7(<2 x s64>), %8(<2 x s64>)
+# AVX1-NEXT: RET 0
+#
+# AVX2: %0(<4 x s64>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<4 x s64>) = G_ADD %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<4 x s64>) = IMPLICIT_DEF
+ %1(<4 x s64>) = IMPLICIT_DEF
+ %2(<4 x s64>) = G_ADD %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
new file mode 100644
index 000000000000..2b8b51acaa55
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
@@ -0,0 +1,139 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512bw -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+
+--- |
+ define void @test_add_v64i8() {
+ %ret = add <64 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_add_v32i16() {
+ %ret = add <32 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_add_v16i32() {
+ %ret = add <16 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_add_v8i64() {
+ %ret = add <8 x i64> undef, undef
+ ret void
+ }
+
+...
+---
+name: test_add_v64i8
+# ALL-LABEL: name: test_add_v64i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX512F: %0(<64 x s8>) = IMPLICIT_DEF
+# AVX512F-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
+# AVX512F-NEXT: %3(<32 x s8>), %4(<32 x s8>) = G_UNMERGE_VALUES %0(<64 x s8>)
+# AVX512F-NEXT: %5(<32 x s8>), %6(<32 x s8>) = G_UNMERGE_VALUES %1(<64 x s8>)
+# AVX512F-NEXT: %7(<32 x s8>) = G_ADD %3, %5
+# AVX512F-NEXT: %8(<32 x s8>) = G_ADD %4, %6
+# AVX512F-NEXT: %2(<64 x s8>) = G_MERGE_VALUES %7(<32 x s8>), %8(<32 x s8>)
+# AVX512F-NEXT: RET 0
+#
+# AVX512BW: %0(<64 x s8>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %2(<64 x s8>) = G_ADD %0, %1
+# AVX512BW-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<64 x s8>) = IMPLICIT_DEF
+ %1(<64 x s8>) = IMPLICIT_DEF
+ %2(<64 x s8>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v32i16
+# ALL-LABEL: name: test_add_v32i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX512F: %0(<32 x s16>) = IMPLICIT_DEF
+# AVX512F-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
+# AVX512F-NEXT: %3(<16 x s16>), %4(<16 x s16>) = G_UNMERGE_VALUES %0(<32 x s16>)
+# AVX512F-NEXT: %5(<16 x s16>), %6(<16 x s16>) = G_UNMERGE_VALUES %1(<32 x s16>)
+# AVX512F-NEXT: %7(<16 x s16>) = G_ADD %3, %5
+# AVX512F-NEXT: %8(<16 x s16>) = G_ADD %4, %6
+# AVX512F-NEXT: %2(<32 x s16>) = G_MERGE_VALUES %7(<16 x s16>), %8(<16 x s16>)
+# AVX512F-NEXT: RET 0
+#
+# AVX512BW: %0(<32 x s16>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %2(<32 x s16>) = G_ADD %0, %1
+# AVX512BW-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<32 x s16>) = IMPLICIT_DEF
+ %1(<32 x s16>) = IMPLICIT_DEF
+ %2(<32 x s16>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v16i32
+# ALL-LABEL: name: test_add_v16i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<16 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<16 x s32>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<16 x s32>) = IMPLICIT_DEF
+ %2(<16 x s32>) = G_ADD %0, %1
+ RET 0
+
+...
+---
+name: test_add_v8i64
+# ALL-LABEL: name: test_add_v8i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<8 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<8 x s64>) = G_ADD %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<8 x s64>) = IMPLICIT_DEF
+ %1(<8 x s64>) = IMPLICIT_DEF
+ %2(<8 x s64>) = G_ADD %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir
new file mode 100644
index 000000000000..2f90fc9a3c90
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir
@@ -0,0 +1,119 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+
+--- |
+ define void @test_sub_v16i8() {
+ %ret = sub <16 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v8i16() {
+ %ret = sub <8 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v4i32() {
+ %ret = sub <4 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v2i64() {
+ %ret = sub <2 x i64> undef, undef
+ ret void
+ }
+...
+---
+name: test_sub_v16i8
+# ALL-LABEL: name: test_sub_v16i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<16 x s8>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<16 x s8>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<16 x s8>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<16 x s8>) = IMPLICIT_DEF
+ %1(<16 x s8>) = IMPLICIT_DEF
+ %2(<16 x s8>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v8i16
+# ALL-LABEL: name: test_sub_v8i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<8 x s16>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<8 x s16>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<8 x s16>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<8 x s16>) = IMPLICIT_DEF
+ %1(<8 x s16>) = IMPLICIT_DEF
+ %2(<8 x s16>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v4i32
+# ALL-LABEL: name: test_sub_v4i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<4 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<4 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<4 x s32>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<4 x s32>) = IMPLICIT_DEF
+ %1(<4 x s32>) = IMPLICIT_DEF
+ %2(<4 x s32>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v2i64
+# ALL-LABEL: name: test_sub_v2i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<2 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<2 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<2 x s64>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<2 x s64>) = IMPLICIT_DEF
+ %1(<2 x s64>) = IMPLICIT_DEF
+ %2(<2 x s64>) = G_SUB %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir
new file mode 100644
index 000000000000..9d07787b8ecb
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir
@@ -0,0 +1,120 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+# TODO: add tests for additional configurations once legalization is supported
+--- |
+ define void @test_sub_v32i8() {
+ %ret = sub <32 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v16i16() {
+ %ret = sub <16 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v8i32() {
+ %ret = sub <8 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v4i64() {
+ %ret = sub <4 x i64> undef, undef
+ ret void
+ }
+
+...
+---
+name: test_sub_v32i8
+# ALL-LABEL: name: test_sub_v32i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX2: %0(<32 x s8>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<32 x s8>) = G_SUB %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<32 x s8>) = IMPLICIT_DEF
+ %1(<32 x s8>) = IMPLICIT_DEF
+ %2(<32 x s8>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v16i16
+# ALL-LABEL: name: test_sub_v16i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX2: %0(<16 x s16>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<16 x s16>) = G_SUB %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<16 x s16>) = IMPLICIT_DEF
+ %1(<16 x s16>) = IMPLICIT_DEF
+ %2(<16 x s16>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v8i32
+# ALL-LABEL: name: test_sub_v8i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX2: %0(<8 x s32>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<8 x s32>) = G_SUB %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>) = IMPLICIT_DEF
+ %2(<8 x s32>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v4i64
+# ALL-LABEL: name: test_sub_v4i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX2: %0(<4 x s64>) = IMPLICIT_DEF
+# AVX2-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
+# AVX2-NEXT: %2(<4 x s64>) = G_SUB %0, %1
+# AVX2-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<4 x s64>) = IMPLICIT_DEF
+ %1(<4 x s64>) = IMPLICIT_DEF
+ %2(<4 x s64>) = G_SUB %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir
new file mode 100644
index 000000000000..c88e074ca413
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir
@@ -0,0 +1,120 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512bw -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+# TODO: add tests for additional configurations once legalization is supported
+--- |
+ define void @test_sub_v64i8() {
+ %ret = sub <64 x i8> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v32i16() {
+ %ret = sub <32 x i16> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v16i32() {
+ %ret = sub <16 x i32> undef, undef
+ ret void
+ }
+
+ define void @test_sub_v8i64() {
+ %ret = sub <8 x i64> undef, undef
+ ret void
+ }
+
+...
+---
+name: test_sub_v64i8
+# ALL-LABEL: name: test_sub_v64i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX512BW: %0(<64 x s8>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %2(<64 x s8>) = G_SUB %0, %1
+# AVX512BW-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<64 x s8>) = IMPLICIT_DEF
+ %1(<64 x s8>) = IMPLICIT_DEF
+ %2(<64 x s8>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v32i16
+# ALL-LABEL: name: test_sub_v32i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# AVX512BW: %0(<32 x s16>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
+# AVX512BW-NEXT: %2(<32 x s16>) = G_SUB %0, %1
+# AVX512BW-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<32 x s16>) = IMPLICIT_DEF
+ %1(<32 x s16>) = IMPLICIT_DEF
+ %2(<32 x s16>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v16i32
+# ALL-LABEL: name: test_sub_v16i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<16 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<16 x s32>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<16 x s32>) = IMPLICIT_DEF
+ %2(<16 x s32>) = G_SUB %0, %1
+ RET 0
+
+...
+---
+name: test_sub_v8i64
+# ALL-LABEL: name: test_sub_v8i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(<8 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
+# ALL-NEXT: %2(<8 x s64>) = G_SUB %0, %1
+# ALL-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<8 x s64>) = IMPLICIT_DEF
+ %1(<8 x s64>) = IMPLICIT_DEF
+ %2(<8 x s64>) = G_SUB %0, %1
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
index 49a7fd79f8b2..5df52c5a058b 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST
-; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST
+; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY
;TODO merge with x86-64 tests (many operations not supported yet)
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index 3e45a9c9a49d..d3d4b297a802 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST
-; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll
index e218fded4d5f..f1ffc15f4d03 100644
--- a/test/CodeGen/X86/GlobalISel/memop-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
; ALL-LABEL: test_load_v4i32_noalign:
diff --git a/test/CodeGen/X86/GlobalISel/mul-scalar.ll b/test/CodeGen/X86/GlobalISel/mul-scalar.ll
index 529e81c43304..450c3839797c 100644
--- a/test/CodeGen/X86/GlobalISel/mul-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/mul-scalar.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
;TODO: instruction selection not supported yet
;define i8 @test_mul_i8(i8 %arg1, i8 %arg2) {
diff --git a/test/CodeGen/X86/GlobalISel/mul-vec.ll b/test/CodeGen/X86/GlobalISel/mul-vec.ll
index 83615a718528..b2e211470f39 100644
--- a/test/CodeGen/X86/GlobalISel/mul-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/mul-vec.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
define <8 x i16> @test_mul_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; SKX-LABEL: test_mul_v8i16:
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
index 446db56b992c..f925c836f3d1 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
@@ -5,6 +5,15 @@
define void @test_mul_vec256() {
ret void
}
+
+ define void @test_add_vec256() {
+ ret void
+ }
+
+ define void @test_sub_vec256() {
+ ret void
+ }
+
...
---
name: test_mul_vec256
@@ -29,3 +38,49 @@ body: |
RET 0
...
+---
+name: test_add_vec256
+alignment: 4
+legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+# CHECK-LABEL: name: test_add_vec256
+# CHECK: registers:
+# CHECK: - { id: 0, class: vecr }
+# CHECK: - { id: 1, class: vecr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>) = G_ADD %0, %0
+ RET 0
+
+...
+---
+name: test_sub_vec256
+alignment: 4
+legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+# CHECK-LABEL: name: test_sub_vec256
+# CHECK: registers:
+# CHECK: - { id: 0, class: vecr }
+# CHECK: - { id: 1, class: vecr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<8 x s32>) = IMPLICIT_DEF
+ %1(<8 x s32>) = G_SUB %0, %0
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
index f824ee12dcfb..e0c12ff44a2f 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
@@ -7,6 +7,14 @@
ret void
}
+ define void @test_add_vec512() {
+ ret void
+ }
+
+ define void @test_sub_vec512() {
+ ret void
+ }
+
...
---
name: test_mul_vec512
@@ -31,3 +39,49 @@ body: |
RET 0
...
+---
+name: test_add_vec512
+alignment: 4
+legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+# CHECK-LABEL: name: test_add_vec512
+# CHECK: registers:
+# CHECK: - { id: 0, class: vecr }
+# CHECK: - { id: 1, class: vecr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<16 x s32>) = G_ADD %0, %0
+ RET 0
+
+...
+---
+name: test_sub_vec512
+alignment: 4
+legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+# CHECK-LABEL: name: test_sub_vec512
+# CHECK: registers:
+# CHECK: - { id: 0, class: vecr }
+# CHECK: - { id: 1, class: vecr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+
+ %0(<16 x s32>) = IMPLICIT_DEF
+ %1(<16 x s32>) = G_SUB %0, %0
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v128.mir b/test/CodeGen/X86/GlobalISel/select-add-v128.mir
new file mode 100644
index 000000000000..a39702340bc2
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-add-v128.mir
@@ -0,0 +1,195 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=SSE2
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=AVX1
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BWVL
+
+--- |
+ define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
+ %ret = add <16 x i8> %arg1, %arg2
+ ret <16 x i8> %ret
+ }
+
+ define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
+ %ret = add <8 x i16> %arg1, %arg2
+ ret <8 x i16> %ret
+ }
+
+ define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+ %ret = add <4 x i32> %arg1, %arg2
+ ret <4 x i32> %ret
+ }
+
+ define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
+ %ret = add <2 x i64> %arg1, %arg2
+ ret <2 x i64> %ret
+ }
+
+...
+---
+name: test_add_v16i8
+# ALL-LABEL: name: test_add_v16i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128 }
+# AVX512VL-NEXT: - { id: 1, class: vr128 }
+# AVX512VL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PADDBrr %0, %1
+#
+# AVX1: %2 = VPADDBrr %0, %1
+#
+# AVX512VL: %2 = VPADDBrr %0, %1
+#
+# AVX512BWVL: %2 = VPADDBZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<16 x s8>) = COPY %xmm0
+ %1(<16 x s8>) = COPY %xmm1
+ %2(<16 x s8>) = G_ADD %0, %1
+ %xmm0 = COPY %2(<16 x s8>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_add_v8i16
+# ALL-LABEL: name: test_add_v8i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128 }
+# AVX512VL-NEXT: - { id: 1, class: vr128 }
+# AVX512VL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PADDWrr %0, %1
+#
+# AVX1: %2 = VPADDWrr %0, %1
+#
+# AVX512VL: %2 = VPADDWrr %0, %1
+#
+# AVX512BWVL: %2 = VPADDWZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<8 x s16>) = COPY %xmm0
+ %1(<8 x s16>) = COPY %xmm1
+ %2(<8 x s16>) = G_ADD %0, %1
+ %xmm0 = COPY %2(<8 x s16>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_add_v4i32
+# ALL-LABEL: name: test_add_v4i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128x }
+# AVX512VL-NEXT: - { id: 1, class: vr128x }
+# AVX512VL-NEXT: - { id: 2, class: vr128x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PADDDrr %0, %1
+#
+# AVX1: %2 = VPADDDrr %0, %1
+#
+# AVX512VL: %2 = VPADDDZ128rr %0, %1
+#
+# AVX512BWVL: %2 = VPADDDZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<4 x s32>) = COPY %xmm0
+ %1(<4 x s32>) = COPY %xmm1
+ %2(<4 x s32>) = G_ADD %0, %1
+ %xmm0 = COPY %2(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_add_v2i64
+# ALL-LABEL: name: test_add_v2i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128x }
+# AVX512VL-NEXT: - { id: 1, class: vr128x }
+# AVX512VL-NEXT: - { id: 2, class: vr128x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PADDQrr %0, %1
+#
+# AVX1: %2 = VPADDQrr %0, %1
+#
+# AVX512VL: %2 = VPADDQZ128rr %0, %1
+#
+# AVX512BWVL: %2 = VPADDQZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<2 x s64>) = COPY %xmm0
+ %1(<2 x s64>) = COPY %xmm1
+ %2(<2 x s64>) = G_ADD %0, %1
+ %xmm0 = COPY %2(<2 x s64>)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v256.mir b/test/CodeGen/X86/GlobalISel/select-add-v256.mir
new file mode 100644
index 000000000000..7556c2104124
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-add-v256.mir
@@ -0,0 +1,185 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BWVL
+
+--- |
+ define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
+ %ret = add <32 x i8> %arg1, %arg2
+ ret <32 x i8> %ret
+ }
+
+ define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
+ %ret = add <16 x i16> %arg1, %arg2
+ ret <16 x i16> %ret
+ }
+
+ define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
+ %ret = add <8 x i32> %arg1, %arg2
+ ret <8 x i32> %ret
+ }
+
+ define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
+ %ret = add <4 x i64> %arg1, %arg2
+ ret <4 x i64> %ret
+ }
+...
+---
+name: test_add_v32i8
+# ALL-LABEL: name: test_add_v32i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256 }
+# AVX512VL-NEXT: - { id: 1, class: vr256 }
+# AVX512VL-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPADDBYrr %0, %1
+#
+# AVX512VL: %2 = VPADDBYrr %0, %1
+#
+# AVX512BWVL: %2 = VPADDBZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<32 x s8>) = COPY %ymm0
+ %1(<32 x s8>) = COPY %ymm1
+ %2(<32 x s8>) = G_ADD %0, %1
+ %ymm0 = COPY %2(<32 x s8>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_add_v16i16
+# ALL-LABEL: name: test_add_v16i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256 }
+# AVX512VL-NEXT: - { id: 1, class: vr256 }
+# AVX512VL-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPADDWYrr %0, %1
+#
+# AVX512VL: %2 = VPADDWYrr %0, %1
+#
+# AVX512BWVL: %2 = VPADDWZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<16 x s16>) = COPY %ymm0
+ %1(<16 x s16>) = COPY %ymm1
+ %2(<16 x s16>) = G_ADD %0, %1
+ %ymm0 = COPY %2(<16 x s16>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_add_v8i32
+# ALL-LABEL: name: test_add_v8i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x }
+# AVX512VL-NEXT: - { id: 1, class: vr256x }
+# AVX512VL-NEXT: - { id: 2, class: vr256x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPADDDYrr %0, %1
+#
+# AVX512VL: %2 = VPADDDZ256rr %0, %1
+#
+# AVX512BWVL: %2 = VPADDDZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<8 x s32>) = COPY %ymm0
+ %1(<8 x s32>) = COPY %ymm1
+ %2(<8 x s32>) = G_ADD %0, %1
+ %ymm0 = COPY %2(<8 x s32>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_add_v4i64
+# ALL-LABEL: name: test_add_v4i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x }
+# AVX512VL-NEXT: - { id: 1, class: vr256x }
+# AVX512VL-NEXT: - { id: 2, class: vr256x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPADDQYrr %0, %1
+#
+# AVX512VL: %2 = VPADDQZ256rr %0, %1
+#
+# AVX512BWVL: %2 = VPADDQZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<4 x s64>) = COPY %ymm0
+ %1(<4 x s64>) = COPY %ymm1
+ %2(<4 x s64>) = G_ADD %0, %1
+ %ymm0 = COPY %2(<4 x s64>)
+ RET 0, implicit %ymm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v512.mir b/test/CodeGen/X86/GlobalISel/select-add-v512.mir
new file mode 100644
index 000000000000..e90be4e996f8
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-add-v512.mir
@@ -0,0 +1,130 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+
+--- |
+ define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) #0 {
+ %ret = add <64 x i8> %arg1, %arg2
+ ret <64 x i8> %ret
+ }
+
+ define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) #0 {
+ %ret = add <32 x i16> %arg1, %arg2
+ ret <32 x i16> %ret
+ }
+
+ define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) #1 {
+ %ret = add <16 x i32> %arg1, %arg2
+ ret <16 x i32> %ret
+ }
+
+ define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) #1 {
+ %ret = add <8 x i64> %arg1, %arg2
+ ret <8 x i64> %ret
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512bw" }
+ attributes #1 = { "target-features"="+avx512f" }
+...
+---
+name: test_add_v64i8
+# ALL-LABEL: name: test_add_v64i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPADDBZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<64 x s8>) = COPY %zmm0
+ %1(<64 x s8>) = COPY %zmm1
+ %2(<64 x s8>) = G_ADD %0, %1
+ %zmm0 = COPY %2(<64 x s8>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_add_v32i16
+# ALL-LABEL: name: test_add_v32i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPADDWZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<32 x s16>) = COPY %zmm0
+ %1(<32 x s16>) = COPY %zmm1
+ %2(<32 x s16>) = G_ADD %0, %1
+ %zmm0 = COPY %2(<32 x s16>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_add_v16i32
+# ALL-LABEL: name: test_add_v16i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPADDDZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<16 x s32>) = COPY %zmm0
+ %1(<16 x s32>) = COPY %zmm1
+ %2(<16 x s32>) = G_ADD %0, %1
+ %zmm0 = COPY %2(<16 x s32>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_add_v8i64
+# ALL-LABEL: name: test_add_v8i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPADDQZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<8 x s64>) = COPY %zmm0
+ %1(<8 x s64>) = COPY %zmm1
+ %2(<8 x s64>) = G_ADD %0, %1
+ %zmm0 = COPY %2(<8 x s64>)
+ RET 0, implicit %zmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir
index 6d8cd2b1367d..9d2a878e7575 100644
--- a/test/CodeGen/X86/GlobalISel/select-br.mir
+++ b/test/CodeGen/X86/GlobalISel/select-br.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
--- |
define void @uncondbr() {
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
index 1d3da6cb88b9..a92c388c1db9 100644
--- a/test/CodeGen/X86/GlobalISel/select-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir
index f6b97b578b92..162de0264435 100644
--- a/test/CodeGen/X86/GlobalISel/select-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
define i8 @const_i8() {
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index 0844701487bc..d1a3abfd0f93 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
define i64 @test_zext_i1(i8 %a) {
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index 831d6efb75f1..dccc20e57100 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
define i32 @test_zext_i1(i1 %a) {
diff --git a/test/CodeGen/X86/GlobalISel/select-frameIndex.mir b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
index 2fa9ac23a7af..1d641ba279af 100644
--- a/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
+++ b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
@@ -1,6 +1,6 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ABI
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ABI
--- |
define i32* @allocai32() {
diff --git a/test/CodeGen/X86/GlobalISel/select-gep.mir b/test/CodeGen/X86/GlobalISel/select-gep.mir
index 2c89b7057c3d..c8a4dc80cb2c 100644
--- a/test/CodeGen/X86/GlobalISel/select-gep.mir
+++ b/test/CodeGen/X86/GlobalISel/select-gep.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
define i32* @test_gep_i32(i32* %arr) {
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v128.mir b/test/CodeGen/X86/GlobalISel/select-sub-v128.mir
new file mode 100644
index 000000000000..d60d4155e29d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v128.mir
@@ -0,0 +1,195 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=SSE2
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=AVX1
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BWVL
+
+--- |
+ define <16 x i8> @test_sub_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
+ %ret = sub <16 x i8> %arg1, %arg2
+ ret <16 x i8> %ret
+ }
+
+ define <8 x i16> @test_sub_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
+ %ret = sub <8 x i16> %arg1, %arg2
+ ret <8 x i16> %ret
+ }
+
+ define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+ %ret = sub <4 x i32> %arg1, %arg2
+ ret <4 x i32> %ret
+ }
+
+ define <2 x i64> @test_sub_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
+ %ret = sub <2 x i64> %arg1, %arg2
+ ret <2 x i64> %ret
+ }
+
+...
+---
+name: test_sub_v16i8
+# ALL-LABEL: name: test_sub_v16i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128 }
+# AVX512VL-NEXT: - { id: 1, class: vr128 }
+# AVX512VL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PSUBBrr %0, %1
+#
+# AVX1: %2 = VPSUBBrr %0, %1
+#
+# AVX512VL: %2 = VPSUBBrr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBBZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<16 x s8>) = COPY %xmm0
+ %1(<16 x s8>) = COPY %xmm1
+ %2(<16 x s8>) = G_SUB %0, %1
+ %xmm0 = COPY %2(<16 x s8>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_sub_v8i16
+# ALL-LABEL: name: test_sub_v8i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128 }
+# AVX512VL-NEXT: - { id: 1, class: vr128 }
+# AVX512VL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PSUBWrr %0, %1
+#
+# AVX1: %2 = VPSUBWrr %0, %1
+#
+# AVX512VL: %2 = VPSUBWrr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBWZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<8 x s16>) = COPY %xmm0
+ %1(<8 x s16>) = COPY %xmm1
+ %2(<8 x s16>) = G_SUB %0, %1
+ %xmm0 = COPY %2(<8 x s16>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_sub_v4i32
+# ALL-LABEL: name: test_sub_v4i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128x }
+# AVX512VL-NEXT: - { id: 1, class: vr128x }
+# AVX512VL-NEXT: - { id: 2, class: vr128x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PSUBDrr %0, %1
+#
+# AVX1: %2 = VPSUBDrr %0, %1
+#
+# AVX512VL: %2 = VPSUBDZ128rr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBDZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<4 x s32>) = COPY %xmm0
+ %1(<4 x s32>) = COPY %xmm1
+ %2(<4 x s32>) = G_SUB %0, %1
+ %xmm0 = COPY %2(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_sub_v2i64
+# ALL-LABEL: name: test_sub_v2i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# NOVL: registers:
+# NOVL-NEXT: - { id: 0, class: vr128 }
+# NOVL-NEXT: - { id: 1, class: vr128 }
+# NOVL-NEXT: - { id: 2, class: vr128 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr128x }
+# AVX512VL-NEXT: - { id: 1, class: vr128x }
+# AVX512VL-NEXT: - { id: 2, class: vr128x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr128x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr128x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# SSE2: %2 = PSUBQrr %0, %1
+#
+# AVX1: %2 = VPSUBQrr %0, %1
+#
+# AVX512VL: %2 = VPSUBQZ128rr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBQZ128rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0, %xmm1
+
+ %0(<2 x s64>) = COPY %xmm0
+ %1(<2 x s64>) = COPY %xmm1
+ %2(<2 x s64>) = G_SUB %0, %1
+ %xmm0 = COPY %2(<2 x s64>)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v256.mir b/test/CodeGen/X86/GlobalISel/select-sub-v256.mir
new file mode 100644
index 000000000000..fbc44997b4a2
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v256.mir
@@ -0,0 +1,185 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BWVL
+
+--- |
+ define <32 x i8> @test_sub_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
+ %ret = sub <32 x i8> %arg1, %arg2
+ ret <32 x i8> %ret
+ }
+
+ define <16 x i16> @test_sub_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
+ %ret = sub <16 x i16> %arg1, %arg2
+ ret <16 x i16> %ret
+ }
+
+ define <8 x i32> @test_sub_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
+ %ret = sub <8 x i32> %arg1, %arg2
+ ret <8 x i32> %ret
+ }
+
+ define <4 x i64> @test_sub_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
+ %ret = sub <4 x i64> %arg1, %arg2
+ ret <4 x i64> %ret
+ }
+...
+---
+name: test_sub_v32i8
+# ALL-LABEL: name: test_sub_v32i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256 }
+# AVX512VL-NEXT: - { id: 1, class: vr256 }
+# AVX512VL-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPSUBBYrr %0, %1
+#
+# AVX512VL: %2 = VPSUBBYrr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBBZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<32 x s8>) = COPY %ymm0
+ %1(<32 x s8>) = COPY %ymm1
+ %2(<32 x s8>) = G_SUB %0, %1
+ %ymm0 = COPY %2(<32 x s8>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_sub_v16i16
+# ALL-LABEL: name: test_sub_v16i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256 }
+# AVX512VL-NEXT: - { id: 1, class: vr256 }
+# AVX512VL-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPSUBWYrr %0, %1
+#
+# AVX512VL: %2 = VPSUBWYrr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBWZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<16 x s16>) = COPY %ymm0
+ %1(<16 x s16>) = COPY %ymm1
+ %2(<16 x s16>) = G_SUB %0, %1
+ %ymm0 = COPY %2(<16 x s16>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_sub_v8i32
+# ALL-LABEL: name: test_sub_v8i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x }
+# AVX512VL-NEXT: - { id: 1, class: vr256x }
+# AVX512VL-NEXT: - { id: 2, class: vr256x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPSUBDYrr %0, %1
+#
+# AVX512VL: %2 = VPSUBDZ256rr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBDZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<8 x s32>) = COPY %ymm0
+ %1(<8 x s32>) = COPY %ymm1
+ %2(<8 x s32>) = G_SUB %0, %1
+ %ymm0 = COPY %2(<8 x s32>)
+ RET 0, implicit %ymm0
+
+...
+---
+name: test_sub_v4i64
+# ALL-LABEL: name: test_sub_v4i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# AVX2: registers:
+# AVX2-NEXT: - { id: 0, class: vr256 }
+# AVX2-NEXT: - { id: 1, class: vr256 }
+# AVX2-NEXT: - { id: 2, class: vr256 }
+#
+# AVX512VL: registers:
+# AVX512VL-NEXT: - { id: 0, class: vr256x }
+# AVX512VL-NEXT: - { id: 1, class: vr256x }
+# AVX512VL-NEXT: - { id: 2, class: vr256x }
+#
+# AVX512BWVL: registers:
+# AVX512BWVL-NEXT: - { id: 0, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 1, class: vr256x }
+# AVX512BWVL-NEXT: - { id: 2, class: vr256x }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# AVX2: %2 = VPSUBQYrr %0, %1
+#
+# AVX512VL: %2 = VPSUBQZ256rr %0, %1
+#
+# AVX512BWVL: %2 = VPSUBQZ256rr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %ymm0, %ymm1
+
+ %0(<4 x s64>) = COPY %ymm0
+ %1(<4 x s64>) = COPY %ymm1
+ %2(<4 x s64>) = G_SUB %0, %1
+ %ymm0 = COPY %2(<4 x s64>)
+ RET 0, implicit %ymm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v512.mir b/test/CodeGen/X86/GlobalISel/select-sub-v512.mir
new file mode 100644
index 000000000000..dcd05f056949
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v512.mir
@@ -0,0 +1,130 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+
+--- |
+ define <64 x i8> @test_sub_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) #0 {
+ %ret = sub <64 x i8> %arg1, %arg2
+ ret <64 x i8> %ret
+ }
+
+ define <32 x i16> @test_sub_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) #0 {
+ %ret = sub <32 x i16> %arg1, %arg2
+ ret <32 x i16> %ret
+ }
+
+ define <16 x i32> @test_sub_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) #1 {
+ %ret = sub <16 x i32> %arg1, %arg2
+ ret <16 x i32> %ret
+ }
+
+ define <8 x i64> @test_sub_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) #1 {
+ %ret = sub <8 x i64> %arg1, %arg2
+ ret <8 x i64> %ret
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512bw" }
+ attributes #1 = { "target-features"="+avx512f" }
+...
+---
+name: test_sub_v64i8
+# ALL-LABEL: name: test_sub_v64i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPSUBBZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<64 x s8>) = COPY %zmm0
+ %1(<64 x s8>) = COPY %zmm1
+ %2(<64 x s8>) = G_SUB %0, %1
+ %zmm0 = COPY %2(<64 x s8>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_sub_v32i16
+# ALL-LABEL: name: test_sub_v32i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPSUBWZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<32 x s16>) = COPY %zmm0
+ %1(<32 x s16>) = COPY %zmm1
+ %2(<32 x s16>) = G_SUB %0, %1
+ %zmm0 = COPY %2(<32 x s16>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_sub_v16i32
+# ALL-LABEL: name: test_sub_v16i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPSUBDZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<16 x s32>) = COPY %zmm0
+ %1(<16 x s32>) = COPY %zmm1
+ %2(<16 x s32>) = G_SUB %0, %1
+ %zmm0 = COPY %2(<16 x s32>)
+ RET 0, implicit %zmm0
+
+...
+---
+name: test_sub_v8i64
+# ALL-LABEL: name: test_sub_v8i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: vr512 }
+# ALL-NEXT: - { id: 1, class: vr512 }
+# ALL-NEXT: - { id: 2, class: vr512 }
+registers:
+ - { id: 0, class: vecr }
+ - { id: 1, class: vecr }
+ - { id: 2, class: vecr }
+# ALL: %2 = VPSUBQZrr %0, %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %zmm0, %zmm1
+
+ %0(<8 x s64>) = COPY %zmm0
+ %1(<8 x s64>) = COPY %zmm1
+ %2(<8 x s64>) = G_SUB %0, %1
+ %zmm0 = COPY %2(<8 x s64>)
+ RET 0, implicit %zmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-trunc.mir b/test/CodeGen/X86/GlobalISel/select-trunc.mir
index 714340248ff6..9b90543d6559 100644
--- a/test/CodeGen/X86/GlobalISel/select-trunc.mir
+++ b/test/CodeGen/X86/GlobalISel/select-trunc.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
define i1 @trunc_i32toi1(i32 %a) {
%r = trunc i32 %a to i1
@@ -33,19 +33,20 @@
...
---
name: trunc_i32toi1
+# CHECK-LABEL: name: trunc_i32toi1
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i32toi1
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32 }
-# CHECK-NEXT: - { id: 1, class: gr8 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr8 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_8
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %0.sub_8bit
+# CHECK-NEXT: %al = COPY %1
+# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
@@ -58,19 +59,20 @@ body: |
...
---
name: trunc_i32toi8
+# CHECK-LABEL: name: trunc_i32toi8
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i32toi8
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32 }
-# CHECK-NEXT: - { id: 1, class: gr8 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr8 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_8
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %0.sub_8bit
+# CHECK-NEXT: %al = COPY %1
+# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
@@ -83,19 +85,20 @@ body: |
...
---
name: trunc_i32toi16
+# CHECK-LABEL: name: trunc_i32toi16
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i32toi16
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32 }
-# CHECK-NEXT: - { id: 1, class: gr16 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr16 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_16
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %0.sub_16bit
+# CHECK-NEXT: %ax = COPY %1
+# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
@@ -108,19 +111,20 @@ body: |
...
---
name: trunc_i64toi8
+# CHECK-LABEL: name: trunc_i64toi8
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i64toi8
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64 }
-# CHECK-NEXT: - { id: 1, class: gr8 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64_with_sub_8bit }
+# CHECK-NEXT: - { id: 1, class: gr8 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_8
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = COPY %0.sub_8bit
+# CHECK-NEXT: %al = COPY %1
+# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %rdi
@@ -133,19 +137,20 @@ body: |
...
---
name: trunc_i64toi16
+# CHECK-LABEL: name: trunc_i64toi16
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i64toi16
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64 }
-# CHECK-NEXT: - { id: 1, class: gr16 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64 }
+# CHECK-NEXT: - { id: 1, class: gr16 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_16
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = COPY %0.sub_16bit
+# CHECK-NEXT: %ax = COPY %1
+# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %rdi
@@ -158,19 +163,20 @@ body: |
...
---
name: trunc_i64toi32
+# CHECK-LABEL: name: trunc_i64toi32
alignment: 4
legalized: true
regBankSelected: true
-selected: false
-# CHECK-LABEL: name: trunc_i64toi32
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64 }
-# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: body:
-# CHECK: %1 = COPY %0.sub_32
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = COPY %0.sub_32bit
+# CHECK-NEXT: %eax = COPY %1
+# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %rdi
diff --git a/test/CodeGen/X86/GlobalISel/sub-vec.ll b/test/CodeGen/X86/GlobalISel/sub-vec.ll
new file mode 100644
index 000000000000..9caf18f0c0c7
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/sub-vec.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
+
+define <16 x i8> @test_sub_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
+; SKX-LABEL: test_sub_v16i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = sub <16 x i8> %arg1, %arg2
+ ret <16 x i8> %ret
+}
+
+define <8 x i16> @test_sub_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
+; SKX-LABEL: test_sub_v8i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = sub <8 x i16> %arg1, %arg2
+ ret <8 x i16> %ret
+}
+
+define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+; SKX-LABEL: test_sub_v4i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = sub <4 x i32> %arg1, %arg2
+ ret <4 x i32> %ret
+}
+
+define <2 x i64> @test_sub_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
+; SKX-LABEL: test_sub_v2i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; SKX-NEXT: retq
+ %ret = sub <2 x i64> %arg1, %arg2
+ ret <2 x i64> %ret
+}
+
+define <32 x i8> @test_sub_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
+; SKX-LABEL: test_sub_v32i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = sub <32 x i8> %arg1, %arg2
+ ret <32 x i8> %ret
+}
+
+define <16 x i16> @test_sub_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
+; SKX-LABEL: test_sub_v16i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = sub <16 x i16> %arg1, %arg2
+ ret <16 x i16> %ret
+}
+
+define <8 x i32> @test_sub_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
+; SKX-LABEL: test_sub_v8i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = sub <8 x i32> %arg1, %arg2
+ ret <8 x i32> %ret
+}
+
+define <4 x i64> @test_sub_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
+; SKX-LABEL: test_sub_v4i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %ret = sub <4 x i64> %arg1, %arg2
+ ret <4 x i64> %ret
+}
+
+define <64 x i8> @test_sub_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
+; SKX-LABEL: test_sub_v64i8:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = sub <64 x i8> %arg1, %arg2
+ ret <64 x i8> %ret
+}
+
+define <32 x i16> @test_sub_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
+; SKX-LABEL: test_sub_v32i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = sub <32 x i16> %arg1, %arg2
+ ret <32 x i16> %ret
+}
+
+define <16 x i32> @test_sub_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
+; SKX-LABEL: test_sub_v16i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = sub <16 x i32> %arg1, %arg2
+ ret <16 x i32> %ret
+}
+
+define <8 x i64> @test_sub_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
+; SKX-LABEL: test_sub_v8i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
+ %ret = sub <8 x i64> %arg1, %arg2
+ ret <8 x i64> %ret
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/trunc.ll b/test/CodeGen/X86/GlobalISel/trunc.ll
index a56fc3b5a87f..6c0f01673afc 100644
--- a/test/CodeGen/X86/GlobalISel/trunc.ll
+++ b/test/CodeGen/X86/GlobalISel/trunc.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
define i1 @trunc_i32toi1(i32 %a) {
; CHECK-LABEL: trunc_i32toi1:
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 262cb96ca6d8..874e3e379d8e 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -4,8 +4,8 @@
; CHECK-LABEL: Pass Arguments:
; CHECK-NEXT: Target Library Information
-; CHECK-NEXT: Target Transform Information
; CHECK-NEXT: Target Pass Configuration
+; CHECK-NEXT: Target Transform Information
; CHECK-NEXT: Type-Based Alias Analysis
; CHECK-NEXT: Scoped NoAlias Alias Analysis
; CHECK-NEXT: Assumption Cache Tracker
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index be550e3fe2d1..3f4ee362e230 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -86,12 +86,12 @@ entry:
define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
; CHECK-LABEL: pr31719:
; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorl %r10d, %r10d
; CHECK-NEXT: addq 8(%rsi), %rcx
-; CHECK-NEXT: sbbq %r10, %r10
-; CHECK-NEXT: andl $1, %r10d
+; CHECK-NEXT: setb %r10b
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: addq 16(%rsi), %r8
-; CHECK-NEXT: sbbq %rax, %rax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: setb %al
; CHECK-NEXT: addq 24(%rsi), %r9
; CHECK-NEXT: addq (%rsi), %rdx
; CHECK-NEXT: adcq $0, %rcx
@@ -190,9 +190,9 @@ entry:
define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-LABEL: shiftadd:
; CHECK: # BB#0: # %entry
+; CHECK-NEXT: leaq (%rdx,%rcx), %rax
; CHECK-NEXT: addq %rsi, %rdi
-; CHECK-NEXT: adcq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: adcq $0, %rax
; CHECK-NEXT: retq
entry:
%0 = zext i64 %a to i128
@@ -213,12 +213,12 @@ define %S @readd(%S* nocapture readonly %this, %S %arg.b) {
; CHECK-NEXT: addq (%rsi), %rdx
; CHECK-NEXT: movq 8(%rsi), %r10
; CHECK-NEXT: adcq $0, %r10
-; CHECK-NEXT: sbbq %rax, %rax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: addq %rcx, %r10
; CHECK-NEXT: adcq 16(%rsi), %rax
-; CHECK-NEXT: sbbq %rcx, %rcx
-; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: addq %r8, %rax
; CHECK-NEXT: adcq 24(%rsi), %rcx
; CHECK-NEXT: addq %r9, %rcx
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index aa28ef5175ed..2aaf14001758 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -135,87 +135,88 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
+; SSE2-NEXT: movdqa (%rdi), %xmm8
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm11, %xmm15
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm15, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm11, %xmm1
+; SSE2-NEXT: paddd %xmm9, %xmm13
+; SSE2-NEXT: paddd %xmm15, %xmm2
+; SSE2-NEXT: paddd %xmm14, %xmm5
+; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm12, %xmm6
+; SSE2-NEXT: paddd %xmm10, %xmm3
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm13
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm7, %xmm3
; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: pand %xmm4, %xmm13
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: packuswb %xmm13, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -258,183 +259,198 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; SSE2-LABEL: avg_v64i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm6
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movdqa 32(%rdi), %xmm1
-; SSE2-NEXT: movdqa 48(%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsi), %xmm5
-; SSE2-NEXT: movdqa 16(%rsi), %xmm13
-; SSE2-NEXT: movdqa 32(%rsi), %xmm11
+; SSE2-NEXT: subq $152, %rsp
+; SSE2-NEXT: .Lcfi0:
+; SSE2-NEXT: .cfi_def_cfa_offset 160
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
+; SSE2-NEXT: movdqa 32(%rdi), %xmm5
+; SSE2-NEXT: movdqa 48(%rdi), %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm6, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm14, %xmm12
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm15, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm15
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm8, %xmm15
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm13
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm14, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: movdqa 16(%rsi), %xmm12
+; SSE2-NEXT: movdqa %xmm12, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm7, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm2, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa 48(%rsi), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; SSE2-NEXT: movdqa 32(%rsi), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm2, %xmm7
+; SSE2-NEXT: movdqa 48(%rsi), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: paddd %xmm8, %xmm4
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm10
+; SSE2-NEXT: paddd %xmm0, %xmm12
; SSE2-NEXT: paddd %xmm0, %xmm11
+; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: packuswb %xmm1, %xmm10
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm12, %xmm4
-; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm15
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm13
-; SSE2-NEXT: packuswb %xmm15, %xmm13
-; SSE2-NEXT: packuswb %xmm4, %xmm13
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: packuswb %xmm15, %xmm7
+; SSE2-NEXT: psrld $1, %xmm14
; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pand %xmm0, %xmm14
+; SSE2-NEXT: packuswb %xmm9, %xmm14
+; SSE2-NEXT: packuswb %xmm7, %xmm14
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: pand %xmm0, %xmm13
; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm9, %xmm6
+; SSE2-NEXT: packuswb %xmm13, %xmm6
+; SSE2-NEXT: psrld $1, %xmm12
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pand %xmm0, %xmm12
+; SSE2-NEXT: packuswb %xmm10, %xmm12
+; SSE2-NEXT: packuswb %xmm6, %xmm12
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: psrld $1, %xmm11
-; SSE2-NEXT: psrld $1, %xmm14
-; SSE2-NEXT: pand %xmm0, %xmm14
; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: packuswb %xmm14, %xmm11
-; SSE2-NEXT: packuswb %xmm6, %xmm11
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm8, %xmm3
-; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: packuswb %xmm11, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm5
; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: packuswb %xmm3, %xmm7
-; SSE2-NEXT: movdqu %xmm7, (%rax)
-; SSE2-NEXT: movdqu %xmm11, (%rax)
-; SSE2-NEXT: movdqu %xmm13, (%rax)
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm12, (%rax)
+; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: addq $152, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
@@ -448,21 +464,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7
+; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6
+; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5
+; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4
+; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10
@@ -524,13 +540,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3
+; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2
+; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -657,27 +673,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm5
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm8, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -739,79 +755,80 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
+; SSE2-NEXT: movdqa (%rdi), %xmm10
+; SSE2-NEXT: movdqa 16(%rdi), %xmm9
+; SSE2-NEXT: movdqa 32(%rdi), %xmm11
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
+; SSE2-NEXT: movdqa (%rsi), %xmm14
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm10, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm14, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: paddd %xmm11, %xmm2
+; SSE2-NEXT: paddd %xmm15, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm1
+; SSE2-NEXT: paddd %xmm12, %xmm6
+; SSE2-NEXT: paddd %xmm10, %xmm14
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm9
+; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: psrld $1, %xmm14
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
+; SSE2-NEXT: pslld $16, %xmm14
+; SSE2-NEXT: psrad $16, %xmm14
+; SSE2-NEXT: packssdw %xmm7, %xmm14
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -820,7 +837,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
+; SSE2-NEXT: movdqu %xmm14, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
@@ -830,13 +847,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -867,9 +884,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
@@ -1030,87 +1047,88 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa 16(%rdi), %xmm8
+; SSE2-NEXT: movdqa (%rdi), %xmm8
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm11, %xmm15
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm15, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm6, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: paddd %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: paddd %xmm11, %xmm1
+; SSE2-NEXT: paddd %xmm9, %xmm13
+; SSE2-NEXT: paddd %xmm15, %xmm2
+; SSE2-NEXT: paddd %xmm14, %xmm5
+; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm12, %xmm6
+; SSE2-NEXT: paddd %xmm10, %xmm3
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm9
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm5
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm13
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm7, %xmm3
; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: pand %xmm4, %xmm13
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm7, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: packuswb %xmm13, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -1494,27 +1512,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm5
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm8, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -1576,79 +1594,80 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
-; SSE2-NEXT: movdqa 32(%rdi), %xmm10
+; SSE2-NEXT: movdqa (%rdi), %xmm10
+; SSE2-NEXT: movdqa 16(%rdi), %xmm9
+; SSE2-NEXT: movdqa 32(%rdi), %xmm11
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm9
+; SSE2-NEXT: movdqa (%rsi), %xmm14
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm10, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm9, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm14, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm4, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: paddd %xmm11, %xmm2
+; SSE2-NEXT: paddd %xmm15, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm1
+; SSE2-NEXT: paddd %xmm12, %xmm6
+; SSE2-NEXT: paddd %xmm10, %xmm14
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm9
+; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: psrld $1, %xmm14
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm9
-; SSE2-NEXT: psrad $16, %xmm9
-; SSE2-NEXT: packssdw %xmm7, %xmm9
+; SSE2-NEXT: pslld $16, %xmm14
+; SSE2-NEXT: psrad $16, %xmm14
+; SSE2-NEXT: packssdw %xmm7, %xmm14
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -1657,7 +1676,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm9, (%rax)
+; SSE2-NEXT: movdqu %xmm14, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
@@ -1667,13 +1686,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -1704,9 +1723,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 3cadbe2a8db3..ff5a2371a145 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl
; X32: # BB#0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_pd:
@@ -2269,19 +2269,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3
; X32: # BB#0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_ps:
@@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub
; X32: # BB#0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
@@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 647b7a8f4dfc..341dd867e4ff 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK: insertps $48
-; CHECK: vaddps
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: vaddps
; CHECK: vaddps
+; CHECK: vaddps
; CHECK-NEXT: ret
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index e29cf09718ad..63b0281a7339 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3
; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
-; CHECK-NEXT: korw %k2, %k1, %k1
+; CHECK-NEXT: korw %k3, %k2, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index c1b64743f898..eae7b94f5135 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -47,16 +47,20 @@ l2:
ret float %c1
}
-; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
define i32 @test3(float %a, float %b) {
-; ALL-LABEL: test3:
-; ALL: ## BB#0:
-; ALL-NEXT: vucomiss %xmm1, %xmm0
-; ALL-NEXT: setnp %al
-; ALL-NEXT: sete %cl
-; ALL-NEXT: andb %al, %cl
-; ALL-NEXT: movzbl %cl, %eax
-; ALL-NEXT: retq
+; KNL-LABEL: test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpeqss %xmm1, %xmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: retq
%cmp10.i = fcmp oeq float %a, %b
%conv11.i = zext i1 %cmp10.i to i32
@@ -69,7 +73,7 @@ define float @test5(float %p) #0 {
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vucomiss %xmm1, %xmm0
; ALL-NEXT: jne LBB3_1
-; ALL-NEXT: jp LBB3_1
+; ALL-NEXT: jp LBB3_1
; ALL-NEXT: ## BB#2: ## %return
; ALL-NEXT: retq
; ALL-NEXT: LBB3_1: ## %if.end
@@ -158,47 +162,22 @@ B:
}
define i32 @test10(i64 %b, i64 %c, i1 %d) {
-; KNL-LABEL: test10:
-; KNL: ## BB#0:
-; KNL-NEXT: andl $1, %edx
-; KNL-NEXT: kmovw %edx, %k0
-; KNL-NEXT: cmpq %rsi, %rdi
-; KNL-NEXT: sete %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: korw %k1, %k0, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: testb %al, %al
-; KNL-NEXT: je LBB8_1
-; KNL-NEXT: ## BB#2: ## %if.end.i
-; KNL-NEXT: movl $6, %eax
-; KNL-NEXT: retq
-; KNL-NEXT: LBB8_1: ## %if.then.i
-; KNL-NEXT: movl $5, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test10:
-; SKX: ## BB#0:
-; SKX-NEXT: andl $1, %edx
-; SKX-NEXT: kmovd %edx, %k0
-; SKX-NEXT: cmpq %rsi, %rdi
-; SKX-NEXT: sete %al
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: korw %k1, %k0, %k1
-; SKX-NEXT: kxorw %k1, %k0, %k0
-; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: testb %al, %al
-; SKX-NEXT: je LBB8_1
-; SKX-NEXT: ## BB#2: ## %if.end.i
-; SKX-NEXT: movl $6, %eax
-; SKX-NEXT: retq
-; SKX-NEXT: LBB8_1: ## %if.then.i
-; SKX-NEXT: movl $5, %eax
-; SKX-NEXT: retq
+; ALL-LABEL: test10:
+; ALL: ## BB#0:
+; ALL-NEXT: movl %edx, %eax
+; ALL-NEXT: andb $1, %al
+; ALL-NEXT: cmpq %rsi, %rdi
+; ALL-NEXT: sete %cl
+; ALL-NEXT: orb %dl, %cl
+; ALL-NEXT: andb $1, %cl
+; ALL-NEXT: cmpb %cl, %al
+; ALL-NEXT: je LBB8_1
+; ALL-NEXT: ## BB#2: ## %if.end.i
+; ALL-NEXT: movl $6, %eax
+; ALL-NEXT: retq
+; ALL-NEXT: LBB8_1: ## %if.then.i
+; ALL-NEXT: movl $5, %eax
+; ALL-NEXT: retq
%cmp8.i = icmp eq i64 %b, %c
%or1 = or i1 %d, %cmp8.i
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 2b55372f3066..33ac15de9de9 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1552,10 +1552,10 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpextrq $1, %xmm0, %rax
+; NOVL-NEXT: vpextrb $8, %xmm0, %eax
; NOVL-NEXT: andl $1, %eax
; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; NOVL-NEXT: vmovq %xmm0, %rax
+; NOVL-NEXT: vpextrb $0, %xmm0, %eax
; NOVL-NEXT: andl $1, %eax
; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index b31b00e54e83..2145f5fb09a8 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -1434,26 +1434,26 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
define i16 @trunc_i32_to_i1(i32 %a) {
; KNL-LABEL: trunc_i32_to_i1:
; KNL: ## BB#0:
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: movw $-4, %ax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: kshiftlw $1, %k1, %k1
-; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k0
+; KNL-NEXT: kshiftlw $1, %k0, %k0
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_i32_to_i1:
; SKX: ## BB#0:
-; SKX-NEXT: andl $1, %edi
-; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: movw $-4, %ax
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftrw $1, %k1, %k1
-; SKX-NEXT: kshiftlw $1, %k1, %k1
-; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovd %eax, %k0
+; SKX-NEXT: kshiftrw $1, %k0, %k0
+; SKX-NEXT: kshiftlw $1, %k0, %k0
+; SKX-NEXT: andl $1, %edi
+; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-fsel.ll b/test/CodeGen/X86/avx512-fsel.ll
index a9b8914ee1fe..7777ba795416 100644
--- a/test/CodeGen/X86/avx512-fsel.ll
+++ b/test/CodeGen/X86/avx512-fsel.ll
@@ -10,25 +10,11 @@ define i32 @test(float %a, float %b) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setp %al
-; CHECK-NEXT: setne %cl
-; CHECK-NEXT: setnp %dl
-; CHECK-NEXT: sete %sil
-; CHECK-NEXT: andb %dl, %sil
-; CHECK-NEXT: ## implicit-def: %EDI
-; CHECK-NEXT: movb %sil, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: orb %al, %cl
-; CHECK-NEXT: ## implicit-def: %EDI
-; CHECK-NEXT: movb %cl, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: kmovw %k1, %edi
-; CHECK-NEXT: movb %dil, %al
-; CHECK-NEXT: testb $1, %al
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: xorb $-1, %cl
+; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: jmp LBB0_2
; CHECK-NEXT: LBB0_1: ## %L_0
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index c03623a2f035..4890afec2164 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
-; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
; CHECK-NEXT: movw $220, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
diff --git a/test/CodeGen/X86/avx512-i1test.ll b/test/CodeGen/X86/avx512-i1test.ll
index 69fafdfff9aa..321f26674e1e 100755
--- a/test/CodeGen/X86/avx512-i1test.ll
+++ b/test/CodeGen/X86/avx512-i1test.ll
@@ -66,14 +66,13 @@ L_30: ; preds = %bb51, %L_10
define i64 @func2(i1 zeroext %i, i32 %j) {
; CHECK-LABEL: func2:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: je .LBB1_1
; CHECK-NEXT: # BB#2: # %if.then
; CHECK-NEXT: jmp bar # TAILCALL
; CHECK-NEXT: .LBB1_1: # %return
-; CHECK-NEXT: orq $-2, %rdi
-; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orq $-2, %rax
; CHECK-NEXT: retq
entry:
%tobool = icmp eq i32 %j, 0
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 87928348a851..29a5325a0ae9 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -260,8 +260,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: kshiftlw $11, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: testb %al, %al
+; KNL-NEXT: testb $1, %al
; KNL-NEXT: je LBB10_2
; KNL-NEXT: ## BB#1: ## %A
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -276,8 +275,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; SKX-NEXT: kshiftlw $11, %k0, %k0
; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: testb %al, %al
+; SKX-NEXT: testb $1, %al
; SKX-NEXT: je LBB10_2
; SKX-NEXT: ## BB#1: ## %A
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -299,13 +297,10 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test12:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
-; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
-; KNL-NEXT: kunpckbw %k0, %k1, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: testb %al, %al
+; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: retq
@@ -313,13 +308,10 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; SKX-LABEL: test12:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
-; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
-; SKX-NEXT: kunpckbw %k0, %k1, %k0
-; SKX-NEXT: kshiftlw $15, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: testb %al, %al
+; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vzeroupper
@@ -335,13 +327,13 @@ define i16 @test13(i32 %a, i32 %b) {
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
+; KNL-NEXT: movw $-4, %cx
+; KNL-NEXT: kmovw %ecx, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k0
+; KNL-NEXT: kshiftlw $1, %k0, %k0
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: movw $-4, %ax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: kshiftlw $1, %k1, %k1
-; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
@@ -350,13 +342,13 @@ define i16 @test13(i32 %a, i32 %b) {
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
+; SKX-NEXT: movw $-4, %cx
+; SKX-NEXT: kmovd %ecx, %k0
+; SKX-NEXT: kshiftrw $1, %k0, %k0
+; SKX-NEXT: kshiftlw $1, %k0, %k0
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: movw $-4, %ax
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftrw $1, %k1, %k1
-; SKX-NEXT: kshiftlw $1, %k1, %k1
-; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
@@ -373,8 +365,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; KNL-NEXT: kshiftlw $11, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: testb %al, %al
+; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: retq
@@ -385,8 +376,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; SKX-NEXT: kshiftlb $3, %k0, %k0
; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: testb %al, %al
+; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
; SKX-NEXT: vzeroupper
@@ -424,14 +414,13 @@ define i16 @test15(i1 *%addr) {
define i16 @test16(i1 *%addr, i16 %a) {
; KNL-LABEL: test16:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl (%rdi), %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kmovw %esi, %k2
+; KNL-NEXT: movb (%rdi), %al
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -440,14 +429,13 @@ define i16 @test16(i1 *%addr, i16 %a) {
;
; SKX-LABEL: test16:
; SKX: ## BB#0:
-; SKX-NEXT: movzbl (%rdi), %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: kmovd %esi, %k1
+; SKX-NEXT: movb (%rdi), %al
+; SKX-NEXT: kmovd %esi, %k0
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpmovm2d %k1, %zmm0
; SKX-NEXT: vpmovm2d %k0, %zmm1
; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; SKX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; SKX-NEXT: vpmovd2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -463,14 +451,13 @@ define i16 @test16(i1 *%addr, i16 %a) {
define i8 @test17(i1 *%addr, i8 %a) {
; KNL-LABEL: test17:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl (%rdi), %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kmovw %esi, %k2
+; KNL-NEXT: movb (%rdi), %al
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -479,14 +466,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
;
; SKX-LABEL: test17:
; SKX: ## BB#0:
-; SKX-NEXT: movzbl (%rdi), %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: kmovd %esi, %k1
+; SKX-NEXT: movb (%rdi), %al
+; SKX-NEXT: kmovd %esi, %k0
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpmovm2q %k1, %zmm0
; SKX-NEXT: vpmovm2q %k0, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; SKX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
@@ -1283,12 +1269,11 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
-; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
+; SKX-NEXT: kunpckwd %k0, %k1, %k0
+; SKX-NEXT: vpmovm2w %k0, %zmm0
; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k1
-; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k2
-; SKX-NEXT: kunpckwd %k1, %k2, %k1
-; SKX-NEXT: vpmovm2w %k1, %zmm0
; SKX-NEXT: vpmovm2w %k0, %zmm1
; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
@@ -1308,33 +1293,29 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; KNL-NEXT: vmovd %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vpextrb $4, %xmm0, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpextrd $3, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vpextrb $12, %xmm0, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
@@ -1349,10 +1330,9 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
-; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k1
-; SKX-NEXT: vpmovm2d %k1, %xmm0
; SKX-NEXT: vpmovm2d %k0, %xmm1
; SKX-NEXT: vpbroadcastq %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
@@ -1373,16 +1353,14 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
@@ -1396,13 +1374,12 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
+; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlw $1, %k1, %k1
-; SKX-NEXT: kshiftrw $1, %k1, %k1
; SKX-NEXT: kshiftlw $1, %k0, %k0
-; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kshiftrw $1, %k0, %k0
+; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
@@ -1422,8 +1399,10 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpextrb $0, %xmm0, %eax
-; KNL-NEXT: addb $4, %al
-; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: movb $4, %cl
+; KNL-NEXT: subb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v2i1:
@@ -1432,11 +1411,10 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; SKX-NEXT: kshiftlw $15, %k0, %k0
; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: cmpb $1, %al
-; SKX-NEXT: movb $3, %al
-; SKX-NEXT: adcb $0, %al
-; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: movb $4, %cl
+; SKX-NEXT: subb %al, %cl
+; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
%t2 = extractelement <2 x i1> %t1, i32 0
@@ -1452,8 +1430,10 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpextrb $0, %xmm0, %eax
-; KNL-NEXT: addb $4, %al
-; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: movb $4, %cl
+; KNL-NEXT: subb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v2i1_alt:
@@ -1462,11 +1442,10 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; SKX-NEXT: kshiftlw $15, %k0, %k0
; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: cmpb $1, %al
-; SKX-NEXT: movb $3, %al
-; SKX-NEXT: adcb $0, %al
-; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: movb $4, %cl
+; SKX-NEXT: subb %al, %cl
+; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
%t2 = extractelement <2 x i1> %t1, i32 0
@@ -1535,8 +1514,10 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpextrb $15, %xmm0, %eax
-; KNL-NEXT: addb $4, %al
-; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: movb $4, %cl
+; KNL-NEXT: subb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v64i1:
@@ -1544,11 +1525,10 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: cmpb $1, %al
-; SKX-NEXT: movb $3, %al
-; SKX-NEXT: adcb $0, %al
-; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: movb $4, %cl
+; SKX-NEXT: subb %al, %cl
+; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
@@ -1566,8 +1546,10 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpextrb $15, %xmm0, %eax
-; KNL-NEXT: addb $4, %al
-; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: andb $1, %al
+; KNL-NEXT: movb $4, %cl
+; KNL-NEXT: subb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v64i1_alt:
@@ -1575,11 +1557,10 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: cmpb $1, %al
-; SKX-NEXT: movb $3, %al
-; SKX-NEXT: adcb $0, %al
-; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: andb $1, %al
+; SKX-NEXT: movb $4, %cl
+; SKX-NEXT: subb %al, %cl
+; SKX-NEXT: movzbl %cl, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
@@ -2332,7 +2313,7 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $1, %edi
-; SKX-NEXT: movl -24(%rsp,%rdi,8), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <2 x i64> %a, %b
@@ -2362,7 +2343,7 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT: andl $3, %edi
-; SKX-NEXT: movl -24(%rsp,%rdi,4), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
%t1 = icmp ugt <4 x i32> %a, %b
@@ -2391,7 +2372,7 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movl (%rsp,%rdi,8), %eax
+; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
@@ -2414,7 +2395,7 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movl (%rsp,%rdi,8), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -2444,7 +2425,7 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm0, (%rsp)
; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movl (%rsp,%rdi,4), %eax
+; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
@@ -2467,7 +2448,7 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
; SKX-NEXT: vpmovm2d %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movl (%rsp,%rdi,4), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
@@ -2500,9 +2481,8 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: movzbl %al, %eax
+; KNL-NEXT: movzbl (%rdi,%rax), %eax
+; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
@@ -2524,7 +2504,7 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; SKX-NEXT: vpmovm2w %k0, %zmm0
; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll
index a1d1a7dae190..a099b80898ee 100644
--- a/test/CodeGen/X86/avx512-insert-extract_i1.ll
+++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -22,9 +22,8 @@ define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b,
; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
-; SKX-NEXT: andb $1, %al
-; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: movzbl (%rdi,%rax), %eax
+; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 56962ca2671d..32da0a70218e 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float>
; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32>
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
@@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
@@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
@@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
@@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
+; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3)
@@ -2881,23 +2881,23 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8
; CHECK-LABEL: test_mask_vextractf32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: kshiftlw $12, %k1, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kshiftlw $12, %k0, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kshiftlw $13, %k0, %k2
; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftlw $15, %k0, %k3
; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k1, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftlw $14, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: kmovw %k3, %ecx
; CHECK-NEXT: vmovd %ecx, %xmm2
-; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -2911,23 +2911,23 @@ define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask)
; CHECK-LABEL: test_mask_vextracti64x4:
; CHECK: ## BB#0:
; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: kshiftlw $12, %k1, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kshiftlw $12, %k0, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kshiftlw $13, %k0, %k2
; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftlw $15, %k0, %k3
; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k1, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftlw $14, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: kmovw %k3, %ecx
; CHECK-NEXT: vmovd %ecx, %xmm2
-; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2
; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
@@ -2942,23 +2942,23 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: kshiftlw $12, %k1, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kshiftlw $12, %k0, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kshiftlw $13, %k0, %k2
; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftlw $15, %k0, %k3
; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k1, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftlw $14, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: kmovw %k3, %ecx
; CHECK-NEXT: vmovd %ecx, %xmm1
-; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
@@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
@@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index f800d01064ba..563cad04b8c2 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -112,6 +112,8 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) {
}
declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
+; TODO: the two kxnor instructions here are a no-op and should be eliminated,
+; probably by FoldConstantArithmetic in SelectionDAG.
define i16 @test_kxnor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kxnor:
; CHECK: ## BB#0:
@@ -121,6 +123,8 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) {
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kxorw %k0, %k1, %k0
; CHECK-NEXT: kxorw %k0, %k2, %k0
+; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -269,16 +273,15 @@ declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
@@ -296,16 +299,15 @@ declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
@@ -477,11 +479,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
@@ -496,11 +498,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %rax
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
@@ -515,11 +517,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %rax
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
@@ -534,11 +536,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %rax
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT: addq %rax, %rcx
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2si %xmm0, %rcx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
@@ -553,11 +555,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -572,11 +574,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -591,11 +593,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -610,11 +612,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT: addl %eax, %ecx
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2si %xmm0, %ecx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -683,9 +685,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -2215,7 +2216,6 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2227,7 +2227,6 @@ define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2239,7 +2238,6 @@ define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2251,7 +2249,6 @@ define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rz:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2263,7 +2260,6 @@ define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2275,7 +2271,6 @@ define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <
define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2295,7 +2290,6 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -2312,7 +2306,6 @@ define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1
define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_current_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2330,7 +2323,6 @@ declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x doubl
define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2342,7 +2334,6 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2354,7 +2345,6 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_ru:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2366,7 +2356,6 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rz:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2378,7 +2367,6 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2390,7 +2378,6 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1
define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2410,7 +2397,6 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
@@ -2425,7 +2411,6 @@ define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double*
define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_current_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2441,7 +2426,6 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2453,7 +2437,6 @@ define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x
define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2473,7 +2456,6 @@ define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2485,7 +2467,6 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x floa
define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2505,7 +2486,6 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -2522,7 +2502,6 @@ define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x f
define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2539,7 +2518,6 @@ declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x doubl
define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2551,7 +2529,6 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2571,7 +2548,6 @@ define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2583,7 +2559,6 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x d
define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2603,7 +2578,6 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
@@ -2618,7 +2592,6 @@ define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2
define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_memfold:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -3652,16 +3625,15 @@ declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm5
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm4, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -3679,16 +3651,15 @@ declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>,
define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm5 {%k1} {z}
; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm5, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -3706,11 +3677,9 @@ declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32
define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -3721,18 +3690,18 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
-; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
-; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k1
-; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k2
-; CHECK-NEXT: korw %k1, %k2, %k1
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k2
-; CHECK-NEXT: kandw %k2, %k1, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %esi
+; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: orb %cl, %dl
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: orb %dl, %al
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -3752,11 +3721,9 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -3768,17 +3735,17 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k1
-; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1}
-; CHECK-NEXT: kmovw %k2, %ecx
-; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
-; CHECK-NEXT: kmovw %k1, %edx
-; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %esi
+; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: andb %cl, %dl
+; CHECK-NEXT: andb %sil, %al
; CHECK-NEXT: andb %dl, %al
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -3899,16 +3866,15 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 {%k1} {z}
; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -3925,15 +3891,14 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
@@ -4057,7 +4022,6 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x flo
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
@@ -4074,7 +4038,6 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x doubl
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
@@ -4435,8 +4398,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vprold $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4455,8 +4418,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4557,9 +4520,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4580,9 +4543,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovapd %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4597,16 +4560,15 @@ declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovaps %xmm0, %xmm5
; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
-; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4621,16 +4583,15 @@ declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
-; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm4
-; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4651,9 +4612,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
@@ -4691,16 +4652,15 @@ declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
-; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm4
-; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm3, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4715,16 +4675,15 @@ declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x doubl
define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovapd %xmm0, %xmm5
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4816,18 +4775,17 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
-; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vmovapd %xmm0, %xmm4
-; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm4
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vmovapd %xmm0, %xmm5
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4844,18 +4802,17 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm0, %xmm4
-; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm4
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm5
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4872,7 +4829,6 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
@@ -4890,7 +4846,6 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4904,18 +4859,17 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm5
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4932,18 +4886,17 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4959,7 +4912,6 @@ define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_mask_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT: vmovss %xmm0, (%rdi)
@@ -4987,7 +4939,6 @@ define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_maskz_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vmovss %xmm0, (%rdi)
@@ -5015,7 +4966,6 @@ define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_mask_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1}
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
@@ -5039,7 +4989,6 @@ define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_maskz_memfold:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
@@ -5064,18 +5013,17 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm5
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5092,18 +5040,17 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5120,18 +5067,17 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm5
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5148,18 +5094,17 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm4
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddps %xmm4, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm5, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5174,7 +5119,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -5188,7 +5132,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -5202,7 +5145,8 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
diff --git a/test/CodeGen/X86/avx512-load-store.ll b/test/CodeGen/X86/avx512-load-store.ll
index 3295c66c6d42..4fd985bf24cd 100644
--- a/test/CodeGen/X86/avx512-load-store.ll
+++ b/test/CodeGen/X86/avx512-load-store.ll
@@ -12,7 +12,7 @@ define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x
; CHECK32-LABEL: test_mm_mask_move_ss:
; CHECK32: # BB#0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andl $1, %eax
+; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; CHECK32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
@@ -37,7 +37,7 @@ define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4
; CHECK32-LABEL: test_mm_maskz_move_ss:
; CHECK32: # BB#0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andl $1, %eax
+; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
@@ -62,7 +62,7 @@ define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2
; CHECK32-LABEL: test_mm_mask_move_sd:
; CHECK32: # BB#0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andl $1, %eax
+; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
@@ -87,7 +87,7 @@ define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <
; CHECK32-LABEL: test_mm_maskz_move_sd:
; CHECK32: # BB#0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andl $1, %eax
+; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
diff --git a/test/CodeGen/X86/avx512-mask-bugfix.ll b/test/CodeGen/X86/avx512-mask-bugfix.ll
deleted file mode 100755
index 1940680f1c10..000000000000
--- a/test/CodeGen/X86/avx512-mask-bugfix.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-; ModuleID = 'foo.ll'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #0
-
-; Function Attrs: nounwind readnone
-declare i64 @llvm.cttz.i64(i64, i1) #0
-
-; Function Attrs: nounwind
-define void @foo(float* noalias %aFOO, float %b, i32 %a) {
-allocas:
- %full_mask_memory.i57 = alloca <8 x float>
- %return_value_memory.i60 = alloca i1
- %cmp.i = icmp eq i32 %a, 65535
- br i1 %cmp.i, label %all_on, label %some_on
-
-all_on:
- %mask0 = load <8 x float>, <8 x float>* %full_mask_memory.i57
- %v0.i.i.i70 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0
- %allon.i.i76 = icmp eq i32 %v0.i.i.i70, 65535
- br i1 %allon.i.i76, label %check_neighbors.i.i121, label %domixed.i.i100
-
-domixed.i.i100:
- br label %check_neighbors.i.i121
-
-check_neighbors.i.i121:
- %v1.i5.i.i116 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0
- %alleq.i.i120 = icmp eq i32 %v1.i5.i.i116, 65535
- br i1 %alleq.i.i120, label %all_equal.i.i123, label %not_all_equal.i.i124
-
-; CHECK: kxnorw %k0, %k0, %k0
-; CHECK: kshiftrw $15, %k0, %k0
-; CHECK: jmp
-; CHECK: kxorw %k0, %k0, %k0
-
-all_equal.i.i123:
- br label %reduce_equal___vyi.exit128
-
-not_all_equal.i.i124:
- br label %reduce_equal___vyi.exit128
-
-reduce_equal___vyi.exit128:
- %calltmp2.i125 = phi i1 [ true, %all_equal.i.i123 ], [ false, %not_all_equal.i.i124 ]
- store i1 %calltmp2.i125, i1* %return_value_memory.i60
- %return_value.i126 = load i1, i1* %return_value_memory.i60
- %. = select i1 %return_value.i126, i32 1, i32 0
- %select_to_float = sitofp i32 %. to float
- ret void
-
-some_on:
- ret void
-}
-
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 7103efe050a4..01153a9e45f7 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -418,7 +418,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; KNL-NEXT: kshiftlw $10, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: andb $1, %al
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
@@ -428,7 +428,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; SKX-NEXT: kshiftlw $10, %k0, %k0
; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: andb $1, %al
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -439,7 +439,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: andb $1, %al
; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -450,7 +450,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: andb $1, %al
; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -965,8 +965,8 @@ define <64 x i8> @test16(i64 %x) {
; SKX-LABEL: test16:
; SKX: ## BB#0:
; SKX-NEXT: kmovq %rdi, %k0
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: movb $1, %al
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
; SKX-NEXT: vpmovm2b %k0, %zmm1
@@ -981,8 +981,8 @@ define <64 x i8> @test16(i64 %x) {
; AVX512BW-LABEL: test16:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k0
-; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
-; AVX512BW-NEXT: kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT: movb $1, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpmovm2b %k1, %zmm0
; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
@@ -1085,7 +1085,6 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: cmpl %edx, %esi
; SKX-NEXT: setg %al
-; SKX-NEXT: andl $1, %eax
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
@@ -1103,7 +1102,6 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: cmpl %edx, %esi
; AVX512BW-NEXT: setg %al
-; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpmovm2b %k1, %zmm0
; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0
@@ -1166,21 +1164,25 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-LABEL: test18:
; KNL: ## BB#0:
; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %esi, %k2
-; KNL-NEXT: kshiftlw $7, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kshiftlw $6, %k2, %k2
+; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: kshiftlw $7, %k0, %k2
; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $6, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: kshiftlw $1, %k1, %k1
-; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: kshiftlw $7, %k0, %k0
-; KNL-NEXT: korw %k0, %k1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $1, %k0, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k0
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $7, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
@@ -1191,16 +1193,20 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kshiftlw $7, %k1, %k2
; SKX-NEXT: kshiftrw $15, %k2, %k2
+; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: kshiftlw $6, %k1, %k1
; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: vpmovm2q %k0, %zmm0
-; SKX-NEXT: vpmovm2q %k1, %zmm1
+; SKX-NEXT: kmovd %ecx, %k0
+; SKX-NEXT: vpmovm2q %k0, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k0
-; SKX-NEXT: kshiftlb $7, %k2, %k1
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: vzeroupper
@@ -1209,21 +1215,25 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; AVX512BW-LABEL: test18:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: kmovd %esi, %k2
-; AVX512BW-NEXT: kshiftlw $7, %k2, %k0
-; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftlw $6, %k2, %k2
+; AVX512BW-NEXT: kmovd %esi, %k0
+; AVX512BW-NEXT: kshiftlw $7, %k0, %k2
; AVX512BW-NEXT: kshiftrw $15, %k2, %k2
+; AVX512BW-NEXT: kmovd %k2, %eax
+; AVX512BW-NEXT: kshiftlw $6, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kmovd %k0, %ecx
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512BW-NEXT: kmovd %ecx, %k1
+; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $7, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: kshiftlw $7, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
@@ -1235,16 +1245,20 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2
; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2
+; AVX512DQ-NEXT: kmovw %k2, %eax
; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1
; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %ecx
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovm2q %k1, %zmm1
+; AVX512DQ-NEXT: kmovw %ecx, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0
; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0
; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0
-; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1
+; AVX512DQ-NEXT: kmovw %eax, %k1
+; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
; AVX512DQ-NEXT: korb %k1, %k0, %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
@@ -1383,10 +1397,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; KNL-LABEL: store_v1i1:
; KNL: ## BB#0:
-; KNL-NEXT: andl $1, %edi
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kxnorw %k0, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rsi)
@@ -1394,20 +1406,16 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
;
; SKX-LABEL: store_v1i1:
; SKX: ## BB#0:
-; SKX-NEXT: andl $1, %edi
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rsi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v1i1:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: andl $1, %edi
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
-; AVX512BW-NEXT: kshiftrw $15, %k1, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rsi)
@@ -1415,10 +1423,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
;
; AVX512DQ-LABEL: store_v1i1:
; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: andl $1, %edi
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rsi)
; AVX512DQ-NEXT: retq
@@ -1613,59 +1619,14 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
@f1.v = internal unnamed_addr global i1 false, align 4
define void @f1(i32 %c) {
-; KNL-LABEL: f1:
-; KNL: ## BB#0: ## %entry
-; KNL-NEXT: movzbl {{.*}}(%rip), %edi
-; KNL-NEXT: movl %edi, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k0
-; KNL-NEXT: kxnorw %k0, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: movb %al, {{.*}}(%rip)
-; KNL-NEXT: xorl $1, %edi
-; KNL-NEXT: jmp _f2 ## TAILCALL
-;
-; SKX-LABEL: f1:
-; SKX: ## BB#0: ## %entry
-; SKX-NEXT: movzbl {{.*}}(%rip), %edi
-; SKX-NEXT: movl %edi, %eax
-; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
-; SKX-NEXT: kxorw %k1, %k0, %k0
-; SKX-NEXT: kmovb %k0, {{.*}}(%rip)
-; SKX-NEXT: xorl $1, %edi
-; SKX-NEXT: jmp _f2 ## TAILCALL
-;
-; AVX512BW-LABEL: f1:
-; AVX512BW: ## BB#0: ## %entry
-; AVX512BW-NEXT: movzbl {{.*}}(%rip), %edi
-; AVX512BW-NEXT: movl %edi, %eax
-; AVX512BW-NEXT: andl $1, %eax
-; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
-; AVX512BW-NEXT: kshiftrw $15, %k1, %k1
-; AVX512BW-NEXT: kxorw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, {{.*}}(%rip)
-; AVX512BW-NEXT: xorl $1, %edi
-; AVX512BW-NEXT: jmp _f2 ## TAILCALL
-;
-; AVX512DQ-LABEL: f1:
-; AVX512DQ: ## BB#0: ## %entry
-; AVX512DQ-NEXT: movzbl {{.*}}(%rip), %edi
-; AVX512DQ-NEXT: movl %edi, %eax
-; AVX512DQ-NEXT: andl $1, %eax
-; AVX512DQ-NEXT: kmovw %eax, %k0
-; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT: kmovb %k0, {{.*}}(%rip)
-; AVX512DQ-NEXT: xorl $1, %edi
-; AVX512DQ-NEXT: jmp _f2 ## TAILCALL
+; CHECK-LABEL: f1:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movzbl {{.*}}(%rip), %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorb $1, %al
+; CHECK-NEXT: movb %al, {{.*}}(%rip)
+; CHECK-NEXT: xorl $1, %edi
+; CHECK-NEXT: jmp _f2 ## TAILCALL
entry:
%.b1 = load i1, i1* @f1.v, align 4
%not..b1 = xor i1 %.b1, true
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 96aefdb10584..4ef88ac495c3 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,11 +9,13 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -32,12 +34,14 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
-; CHECK-NEXT: korb %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -56,12 +60,14 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -79,12 +85,14 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
-; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
+; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
+; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -98,18 +106,20 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_64i1:
; CHECK: ## BB#0:
-; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
-; CHECK-NEXT: korq %k1, %k0, %k0
-; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
+; CHECK-NEXT: korq %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: popq %rax
+; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: retq
%cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512-memfold.ll b/test/CodeGen/X86/avx512-memfold.ll
index d754b2b78f6c..17cb30255f75 100644
--- a/test/CodeGen/X86/avx512-memfold.ll
+++ b/test/CodeGen/X86/avx512-memfold.ll
@@ -4,11 +4,9 @@
define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
%b.val = load float, float* %b
@@ -24,7 +22,6 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -41,7 +38,6 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -61,7 +57,6 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 334097917853..f43d5b3e11dd 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1,16 +1,10 @@
-; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=LINUXOSX64 %s
-; X32-LABEL: test_argReti1:
-; X32: kmov{{.*}} %eax, %k{{[0-7]}}
-; X32: kmov{{.*}} %k{{[0-7]}}, %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argReti1:
-; WIN64: kmov{{.*}} %eax, %k{{[0-7]}}
-; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax
-; WIN64: ret{{.*}}
+; ALL-LABEL: test_argReti1:
+; ALL: incb %al
+; ALL: ret{{.*}}
; Test regcall when receiving/returning i1
define x86_regcallcc i1 @test_argReti1(i1 %a) {
@@ -18,17 +12,11 @@ define x86_regcallcc i1 @test_argReti1(i1 %a) {
ret i1 %add
}
-; X32-LABEL: test_CallargReti1:
-; X32: kmov{{.*}} %k{{[0-7]}}, %eax
-; X32: call{{.*}} {{.*}}test_argReti1
-; X32: kmov{{.*}} %eax, %k{{[0-7]}}
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargReti1:
-; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax
-; WIN64: call{{.*}} {{.*}}test_argReti1
-; WIN64: kmov{{.*}} %eax, %k{{[0-7]}}
-; WIN64: ret{{.*}}
+; ALL-LABEL: test_CallargReti1:
+; ALL: movzbl %al, %eax
+; ALL: call{{.*}}test_argReti1
+; ALL: incb %al
+; ALL: ret{{.*}}
; Test regcall when passing/retrieving i1
define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll
index 47c6813fa8dc..f6ee8ff4c0f6 100644
--- a/test/CodeGen/X86/avx512-scalar_mask.ll
+++ b/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -7,7 +7,6 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
; CHECK-LABEL: test_var_mask:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -18,7 +17,6 @@ define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %
define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
; CHECK-LABEL: test_var_maskz:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -30,7 +28,8 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float>
define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_mask:
; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
@@ -41,7 +40,8 @@ define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_maskz:
; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
@@ -52,7 +52,8 @@ define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_mask:
; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: movb $2, %al
+; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
@@ -63,7 +64,8 @@ define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_maskz:
; CHECK: ## BB#0:
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: movb $2, %al
+; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 1859b1bcfaf6..e81f983d9fe6 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -161,7 +161,7 @@ define i64 @pr30249() {
define double @pr30561_f64(double %b, double %a, i1 %c) {
; CHECK-LABEL: pr30561_f64:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: andb $1, %dil
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -172,7 +172,7 @@ define double @pr30561_f64(double %b, double %a, i1 %c) {
define float @pr30561_f32(float %b, float %a, i1 %c) {
; CHECK-LABEL: pr30561_f32:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: andb $1, %dil
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index faa055dfbbf3..9b4e73a18fc2 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 13b850ccc3b6..3337f42eb142 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 571f345d4616..7df07b0413ed 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index f8f47c87100a..8f528394f5bd 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 96254f7c95b0..37aea45e6107 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 1377733739fe..cf79819734a2 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -13,10 +13,9 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
; CHECK-NEXT: kshiftlb $6, %k0, %k0
; CHECK-NEXT: kshiftrb $7, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vmovq %rax, %xmm2
-; CHECK-NEXT: kmovw %k1, %eax
-; CHECK-NEXT: vmovq %rax, %xmm3
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: vmovd %ecx, %xmm2
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2
; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2
; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
@@ -40,8 +39,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 97ac0fde10ec..06ee237593e7 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -262,7 +262,6 @@ declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>,<4
define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -279,7 +278,6 @@ declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>,<4 x
define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -296,7 +294,6 @@ declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -313,7 +310,6 @@ declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>,<
define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -367,14 +363,11 @@ declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: addb %cl, %al
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -389,14 +382,11 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss:
; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: vfpclassss $4, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: addb %cl, %al
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -414,8 +404,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -434,8 +424,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 595b3e0ebb86..52a84deebf51 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 1bfdfd0e634d..ad9ea93c2031 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index b8531e25bfa1..0e4922f37bbb 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -121,7 +121,6 @@ declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x flo
define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_maskz:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -132,7 +131,6 @@ define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_mask:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
@@ -144,7 +142,6 @@ define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x
define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -155,7 +152,6 @@ define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_mask:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
@@ -169,7 +165,6 @@ declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2
define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -182,7 +177,6 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i
define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
; CHECK-NEXT: retq # encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 9659dc6d455a..30ecc0d2e49e 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index b2fe6eba88ab..3ca686cef3bf 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c2d8df6476b3..4d906a4fd29a 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
-; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
-; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
+; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
-; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
+; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
@@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
-; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
+; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
-; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
+; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
-; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
+; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 684b0468cf51..1f324d679564 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
new file mode 100644
index 000000000000..d1508f99fc71
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -0,0 +1,823 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512
+
+define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-SSSE3-LABEL: v8i16:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8i16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $6, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $5, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $4, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v8i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: retq
+ %x = icmp sgt <8 x i16> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
+
+define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-SSSE3-LABEL: v4i32:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v4i32:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <4 x i32> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
+
+define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
+; SSE2-SSSE3-LABEL: v4f32:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4f32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractps $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vextractps $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vextractps $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vextractps $0, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v4f32:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = fcmp ogt <4 x float> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
+
+define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-SSSE3-LABEL: v16i8:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT: andb $1, %cl
+; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: andb $1, %al
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v16i8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v16i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: retq
+ %x = icmp sgt <16 x i8> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
+; SSE2-SSSE3-LABEL: v2i8:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: psllq $56, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $24, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: psllq $56, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $24, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movq %xmm0, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <2 x i8> %a, %b
+ %res = bitcast <2 x i1> %x to i2
+ ret i2 %res
+}
+
+define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
+; SSE2-SSSE3-LABEL: v2i16:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: psllq $48, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-SSSE3-NEXT: psllq $48, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $16, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movq %xmm0, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <2 x i16> %a, %b
+ %res = bitcast <2 x i1> %x to i2
+ ret i2 %res
+}
+
+define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
+; SSE2-SSSE3-LABEL: v2i32:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: psllq $32, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-SSSE3-NEXT: psllq $32, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movq %xmm0, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v2i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v2i32:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <2 x i32> %a, %b
+ %res = bitcast <2 x i1> %x to i2
+ ret i2 %res
+}
+
+define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-SSSE3-LABEL: v2i64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movq %xmm0, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v2i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v2i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <2 x i64> %a, %b
+ %res = bitcast <2 x i1> %x to i2
+ ret i2 %res
+}
+
+define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
+; SSE2-SSSE3-LABEL: v2f64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movq %xmm1, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-SSSE3-NEXT: movq %xmm0, %rax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v2f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v2f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = fcmp ogt <2 x double> %a, %b
+ %res = bitcast <2 x i1> %x to i2
+ ret i2 %res
+}
+
+define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
+; SSE2-SSSE3-LABEL: v4i8:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pslld $24, %xmm1
+; SSE2-SSSE3-NEXT: psrad $24, %xmm1
+; SSE2-SSSE3-NEXT: pslld $24, %xmm0
+; SSE2-SSSE3-NEXT: psrad $24, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <4 x i8> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
+
+define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
+; SSE2-SSSE3-LABEL: v4i16:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: pslld $16, %xmm1
+; SSE2-SSSE3-NEXT: psrad $16, %xmm1
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: retq
+ %x = icmp sgt <4 x i16> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
+
+define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
+; SSE2-SSSE3-LABEL: v8i8:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: psllw $8, %xmm1
+; SSE2-SSSE3-NEXT: psraw $8, %xmm1
+; SSE2-SSSE3-NEXT: psllw $8, %xmm0
+; SSE2-SSSE3-NEXT: psraw $8, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movd %xmm0, %eax
+; SSE2-SSSE3-NEXT: andl $1, %eax
+; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v8i8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $6, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $5, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $4, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $3, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $2, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpextrw $1, %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: andl $1, %eax
+; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: retq
+ %x = icmp sgt <8 x i8> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
new file mode 100644
index 000000000000..51c6ad7c7f9e
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+
+define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
+; AVX2-LABEL: v16i16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = icmp sgt <16 x i16> %a, %b
+ %res = bitcast <16 x i1> %x to i16
+ ret i16 %res
+}
+
+define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
+; AVX2-LABEL: v8i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $6, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $5, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $4, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $3, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $2, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $1, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v8i32:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = icmp sgt <8 x i32> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
+
+define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
+; AVX2-LABEL: v8f32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $6, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $5, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $4, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $3, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $2, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrw $1, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v8f32:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = fcmp ogt <8 x float> %a, %b
+ %res = bitcast <8 x i1> %x to i8
+ ret i8 %res
+}
+
+define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
+; AVX2-LABEL: v32i8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: Lcfi0:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: Lcfi1:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: Lcfi2:
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rsp)
+; AVX2-NEXT: movl (%rsp), %eax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = icmp sgt <32 x i8> %a, %b
+ %res = bitcast <32 x i1> %x to i32
+ ret i32 %res
+}
+
+define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
+; AVX2-LABEL: v4i64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v4i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = icmp sgt <4 x i64> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
+
+define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
+; AVX2-LABEL: v4f64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: v4f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %x = fcmp ogt <4 x double> %a, %b
+ %res = bitcast <4 x i1> %x to i4
+ ret i4 %res
+}
diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll
index 1340b7662a7a..a9c74df9d0d9 100644
--- a/test/CodeGen/X86/bswap_tree2.ll
+++ b/test/CodeGen/X86/bswap_tree2.ll
@@ -9,32 +9,31 @@
define i32 @test1(i32 %x) nounwind {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000
-; CHECK-NEXT: shll $8, %ecx
-; CHECK-NEXT: shrl $8, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: bswapl %eax
-; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: shrl $8, %eax
+; CHECK-NEXT: bswapl %ecx
+; CHECK-NEXT: shrl $16, %ecx
; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test1:
; CHECK64: # BB#0:
-; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
; CHECK64-NEXT: movl %edi, %ecx
-; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
-; CHECK64-NEXT: shll $8, %eax
-; CHECK64-NEXT: shrl $8, %ecx
-; CHECK64-NEXT: orl %eax, %ecx
+; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
+; CHECK64-NEXT: movl %edi, %eax
+; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
+; CHECK64-NEXT: shll $8, %ecx
+; CHECK64-NEXT: shrl $8, %eax
; CHECK64-NEXT: bswapl %edi
; CHECK64-NEXT: shrl $16, %edi
-; CHECK64-NEXT: orl %ecx, %edi
-; CHECK64-NEXT: movl %edi, %eax
+; CHECK64-NEXT: orl %ecx, %eax
+; CHECK64-NEXT: orl %edi, %eax
; CHECK64-NEXT: retq
%byte0 = and i32 %x, 255 ; 0x000000ff
%byte1 = and i32 %x, 65280 ; 0x0000ff00
diff --git a/test/CodeGen/X86/constant-combines.ll b/test/CodeGen/X86/constant-combines.ll
index 5ea736e92c78..4f55814958f4 100644
--- a/test/CodeGen/X86/constant-combines.ll
+++ b/test/CodeGen/X86/constant-combines.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@@ -11,13 +12,20 @@ define void @PR22524({ float, float }* %arg) {
; it folded it to a zero too late to legalize the zero store operation. If this
; ever starts forming a zero store instead of movss, the test case has stopped
; being useful.
-;
+;
; CHECK-LABEL: PR22524:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $0, 4(%rdi)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: mulss %xmm0, %xmm1
+; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movss %xmm1, 4(%rdi)
+; CHECK-NEXT: retq
entry:
%0 = getelementptr inbounds { float, float }, { float, float }* %arg, i32 0, i32 1
store float 0.000000e+00, float* %0, align 4
-; CHECK: movl $0, 4(%rdi)
-
%1 = getelementptr inbounds { float, float }, { float, float }* %arg, i64 0, i32 0
%2 = bitcast float* %1 to i64*
%3 = load i64, i64* %2, align 8
@@ -28,8 +36,6 @@ entry:
%8 = fmul float %7, 0.000000e+00
%9 = bitcast float* %1 to i32*
store i32 %6, i32* %9, align 4
-; CHECK: movl $0, (%rdi)
store float %8, float* %0, align 4
-; CHECK: movss %{{.*}}, 4(%rdi)
ret void
}
diff --git a/test/CodeGen/X86/fast-isel-load-i1.ll b/test/CodeGen/X86/fast-isel-load-i1.ll
index 2f3c6c4b84b9..f515d38cbb95 100644
--- a/test/CodeGen/X86/fast-isel-load-i1.ll
+++ b/test/CodeGen/X86/fast-isel-load-i1.ll
@@ -4,9 +4,7 @@
define i1 @test_i1(i1* %b) {
; CHECK-LABEL: test_i1:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: testb $1, (%rdi)
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # BB#1: # %in
; CHECK-NEXT: xorl %eax, %eax
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index bb332f7282a8..d1d69c68af7b 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -141,7 +141,6 @@ define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
; SKX-LABEL: test11:
; SKX: # BB#0: # %entry
; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
-; SKX-NEXT: andl $1, %edi
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -150,7 +149,6 @@ define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
; KNL: # BB#0: # %entry
; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0
-; KNL-NEXT: andl $1, %edi
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
@@ -186,7 +184,6 @@ define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i
; SKX-LABEL: test13:
; SKX: # BB#0: # %entry
; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; SKX-NEXT: andl $1, %edi
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -194,10 +191,10 @@ define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i
; KNL-LABEL: test13:
; KNL: # BB#0: # %entry
; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: andl $1, %edi
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
+
entry:
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
%0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
new file mode 100644
index 000000000000..bd8888966cf2
--- /dev/null
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s
+
+; This test checks the fusing of MUL + SUB/ADD to FMSUBADD.
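+; Each function builds the pattern by hand: %C is both added to and subtracted
+; from the product %AB = %A * %B, and a shufflevector then keeps the add result
+; in even lanes and the subtract result in odd lanes, which is the lane layout
+; an FMSUBADD-style instruction would cover.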
+
+define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
+; FMA3_256-LABEL: mul_subadd_pd128:
+; FMA3_256: # BB#0: # %entry
+; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0
+; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1
+; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; FMA3_256-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: mul_subadd_pd128:
+; FMA3_512: # BB#0: # %entry
+; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
+; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1
+; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; FMA3_512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_pd128:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1
+; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <2 x double> %A, %B
+ %Sub = fsub <2 x double> %AB, %C
+ %Add = fadd <2 x double> %AB, %C
+ %subadd = shufflevector <2 x double> %Add, <2 x double> %Sub, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %subadd
+}
+
+define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
+; FMA3-LABEL: mul_subadd_ps128:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1
+; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; FMA3-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_ps128:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1
+; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <4 x float> %A, %B
+ %Sub = fsub <4 x float> %AB, %C
+ %Add = fadd <4 x float> %AB, %C
+ %subadd = shufflevector <4 x float> %Add, <4 x float> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %subadd
+}
+
+define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
+; FMA3-LABEL: mul_subadd_pd256:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1
+; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; FMA3-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_pd256:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1
+; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <4 x double> %A, %B
+ %Sub = fsub <4 x double> %AB, %C
+ %Add = fadd <4 x double> %AB, %C
+ %subadd = shufflevector <4 x double> %Add, <4 x double> %Sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %subadd
+}
+
+define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
+; FMA3-LABEL: mul_subadd_ps256:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1
+; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; FMA3-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_ps256:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1
+; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <8 x float> %A, %B
+ %Sub = fsub <8 x float> %AB, %C
+ %Add = fadd <8 x float> %AB, %C
+ %subadd = shufflevector <8 x float> %Add, <8 x float> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %subadd
+}
+
+define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
+; FMA3_256-LABEL: mul_subadd_pd512:
+; FMA3_256: # BB#0: # %entry
+; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0
+; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1
+; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
+; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
+; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: mul_subadd_pd512:
+; FMA3_512: # BB#0: # %entry
+; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
+; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1
+; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; FMA3_512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_pd512:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
+; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
+; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <8 x double> %A, %B
+ %Sub = fsub <8 x double> %AB, %C
+ %Add = fadd <8 x double> %AB, %C
+ %subadd = shufflevector <8 x double> %Add, <8 x double> %Sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x double> %subadd
+}
+
+define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
+; FMA3_256-LABEL: mul_subadd_ps512:
+; FMA3_256: # BB#0: # %entry
+; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
+; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
+; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: mul_subadd_ps512:
+; FMA3_512: # BB#0: # %entry
+; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1
+; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; FMA3_512-NEXT: kmovw %eax, %k1
+; FMA3_512-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: mul_subadd_ps512:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
+; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
+; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
+; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <16 x float> %A, %B
+ %Sub = fsub <16 x float> %AB, %C
+ %Add = fadd <16 x float> %AB, %C
+ %subadd = shufflevector <16 x float> %Add, <16 x float> %Sub, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x float> %subadd
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index eb06eb75a4d7..d68236e9d250 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,10 +6,9 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
-; CHECK: addl {{.*#+}} 4-byte Folded Reload
-; CHECK: xorl {{.*#+}} 4-byte Folded Reload
-; CHECK: xorl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: imull {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
; CHECK: retl
%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 6c6bc8bdc1d1..98082ec611d4 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index b5507523a75a..4596b83f7bc2 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
new file mode 100644
index 000000000000..e3986e47df4d
--- /dev/null
+++ b/test/CodeGen/X86/leaFixup32.mir
@@ -0,0 +1,509 @@
+# RUN: llc -run-pass x86-fixup-LEAs -mtriple=i386 -verify-machineinstrs -mcpu=corei7-avx -o - %s | FileCheck %s
+--- |
+ ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll'
+ source_filename = "test/CodeGen/X86/fixup-lea.ll"
+ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+ target triple = "i386"
+  ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFixup32.mir
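+  ; The IR functions below are only stubs; the checked behaviour is in the MIR
+  ; body of each one further down, where the CHECK lines describe how the
+  ; x86-fixup-LEAs pass is expected to rewrite (or deliberately leave alone)
+  ; each LEA32r variant.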
+
+ ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions
+ ; where ADD32ri8 is chosen
+ define i32 @test2add_32() {
+ ret i32 0
+ }
+
+ ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @test2add_ebp_32() {
+ ret i32 0
+ }
+
+ ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced
+ ; with an add instruction
+ define i32 @test1add_ebp_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
+ define i32 @testleaadd_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
+ ; where the base is ebp register
+ define i32 @testleaadd_ebp_32() {
+ ret i32 0
+ }
+
+  ;test1lea_ebp_32: 2 operands LEA32r where base register is rbp/r13/ebp and can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_ebp_32() {
+ ret i32 0
+ }
+
+ ;test2addi32_32: 3 operands LEA32r that can be replaced with 2 add instructions where ADD32ri32
+ ; is chosen
+ define i32 @test2addi32_32() {
+ ret i32 0
+ }
+
+ ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @test1mov1add_ebp_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is offset
+ define i32 @testleaadd_ebp_index_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is scale
+ define i32 @testleaadd_ebp_index2_32() {
+ ret i32 0
+ }
+
+  ;test_skip_opt_32: 3 operands LEA32r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_32() {
+ ret i32 0
+ }
+
+  ;test_skip_eflags_32: LEA32r that cannot be replaced since it is not safe to clobber eflags
+ define i32 @test_skip_eflags_32() {
+ ret i32 0
+ }
+
+...
+---
+name: test2add_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %eax = ADD32rr %eax, killed %ebp
+ ; CHECK: %eax = ADD32ri8 %eax, -5
+
+ %eax = LEA32r killed %eax, 1, killed %ebp, -5, _
+ RETQ %eax
+
+...
+---
+name: test2add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebp = ADD32rr %ebp, killed %eax
+ ; CHECK: %ebp = ADD32ri8 %ebp, -5
+
+ %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebp = ADD32rr %ebp, killed %eax
+
+ %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _
+ RETQ %ebp
+
+...
+---
+name: testleaadd_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %esi
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+
+ %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _
+ RETQ %ebx
+
+...
+---
+name: test2addi32_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %eax = ADD32rr %eax, killed %ebp
+ ; CHECK: %eax = ADD32ri %eax, 129
+
+ %eax = LEA32r killed %eax, 1, killed %ebp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = MOV32rr %ebp
+ ; CHECK: %ebx = ADD32rr %ebx, %ebp
+
+ %ebx = LEA32r %ebp, 1, %ebp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_index_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r _, 1, %ebp, 5, _
+ ; CHECK: %ebx = ADD32rr %ebx, %ebp
+
+ %ebx = LEA32r %ebp, 1, %ebp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_index2_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r _, 4, %ebp, 5, _
+ ; CHECK: %ebx = ADD32rr %ebx, %ebp
+
+ %ebx = LEA32r %ebp, 4, %ebp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test_skip_opt_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+
+ %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebp' }
+ - { reg: '%eax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
+ ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _
+ ; CHECK: %ebp = ADD32ri8 %ebp, 5
+
+ CMP32rr %eax, killed %ebx, implicit-def %eflags
+ %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %eax, %ebp, %ebx
+ %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _
+ RETQ %ebp
+
+...
+
+
+
diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
new file mode 100644
index 000000000000..b35dee181a47
--- /dev/null
+++ b/test/CodeGen/X86/leaFixup64.mir
@@ -0,0 +1,1041 @@
+# RUN: llc -run-pass x86-fixup-LEAs -mtriple=x86_64-gnu-unknown -verify-machineinstrs -mcpu=corei7-avx -o - %s | FileCheck %s
+--- |
+ ; ModuleID = 'lea-2.ll'
+ source_filename = "lea-2.ll"
+ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFixup64.mir
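+  ; As in leaFixup32.mir, the IR functions are stubs and the CHECK lines in each
+  ; MIR body below give the expected rewrite; note that several LEA64_32r cases
+  ; can only be turned into a shorter LEA plus one ADD rather than two ADDs
+  ; (see the per-test comments).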
+
+ ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
+ ; but can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_64_32_1() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
+ ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_rbp_64_32_1() {
+ ret i32 0
+ }
+
+  ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and cannot
+ ; be replaced with an add instruction but can be replaced with 1 lea instruction
+ define i32 @test1lea_rbp_64_32_1() {
+ ret i32 0
+ }
+
+ ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions
+ define i32 @test2add_64() {
+ ret i32 0
+ }
+
+ ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @test2add_rbp_64() {
+ ret i32 0
+ }
+
+ ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced
+ ; with an add instruction
+ define i32 @test1add_rbp_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
+ define i32 @testleaadd_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @testleaadd_rbp_64_32() {
+ ret i32 0
+ }
+
+ ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_rbp_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
+ define i32 @testleaadd_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @testleaadd_rbp_64() {
+ ret i32 0
+ }
+
+  ;test1lea_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_rbp_64() {
+ ret i32 0
+ }
+
+ ;test8: dst = base & scale!=1, can't optimize
+ define i32 @test8() {
+ ret i32 0
+ }
+
+ ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where
+ ; ADD64ri32 is chosen
+ define i32 @testleaaddi32_64_32() {
+ ret i32 0
+ }
+
+ ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @test1mov1add_rbp_64_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is offset
+ define i32 @testleaadd_rbp_index_64_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is scale
+ define i32 @testleaadd_rbp_index2_64_32() {
+ ret i32 0
+ }
+
+ ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32
+ ; is chosen
+ define i32 @test2addi32_64() {
+ ret i32 0
+ }
+
+ ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions
+ ; where the base is rbp/r13/ebp register
+ define i32 @test1mov1add_rbp_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is offset
+ define i32 @testleaadd_rbp_index_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
+ ; where the base and the index are ebp register and there is scale
+ define i32 @testleaadd_rbp_index2_64() {
+ ret i32 0
+ }
+
+  ;test_skip_opt_64: 3 operands LEA64r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_64() {
+ ret i32 0
+ }
+
+  ;test_skip_eflags_64: LEA64r that cannot be replaced since it is not safe to clobber eflags
+ define i32 @test_skip_eflags_64() {
+ ret i32 0
+ }
+
+  ;test_skip_opt_64_32: 3 operands LEA64_32r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_64_32() {
+ ret i32 0
+ }
+
+  ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since it is not safe to clobber eflags
+ define i32 @test_skip_eflags_64_32() {
+ ret i32 0
+ }
+
+
+...
+---
+name: testleaadd_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %eax = ADD32ri8 %eax, -5
+
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ RETQ %eax
+
+...
+---
+name: testleaadd_rbp_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %ebp = ADD32ri8 %ebp, -5
+
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1lea_rbp_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
+
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebp
+
+...
+---
+name: test2add_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rax = ADD64rr %rax, killed %rbp
+ ; CHECK: %rax = ADD64ri8 %rax, -5
+
+ %rax = LEA64r killed %rax, 1, killed %rbp, -5, _
+ RETQ %eax
+
+...
+---
+name: test2add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbp = ADD64rr %rbp, killed %rax
+ ; CHECK: %rbp = ADD64ri8 %rbp, -5
+
+ %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbp = ADD64rr %rbp, killed %rax
+
+ %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebp
+
+...
+---
+name: testleaadd_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = ADD64ri8 %rbx, -5
+
+ %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = ADD64ri8 %rbx, -5
+
+ %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+
+ %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebx
+
+...
+---
+name: test8
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rdi, %rbp
+ ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _
+ ; CHECK: %r12 = ADD64rr %r12, killed %rbp
+ %rbp = KILL %rbp, implicit-def %rbp
+ %r13 = KILL %rdi, implicit-def %r13
+ %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _
+ RETQ %r12
+
+...
+---
+name: testleaaddi32_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %eax = ADD32ri %eax, 129
+
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index2_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+
+ %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test2addi32_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rax = ADD64rr %rax, killed %rbp
+ ; CHECK: %rax = ADD64ri32 %rax, 129
+
+ %rax = LEA64r killed %rax, 1, killed %rbp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = MOV64rr %rbp
+ ; CHECK: %rbx = ADD64rr %rbx, %rbp
+
+ %rbx = LEA64r %rbp, 1, %rbp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r _, 1, %rbp, 5, _
+ ; CHECK: %rbx = ADD64rr %rbx, %rbp
+
+ %rbx = LEA64r %rbp, 1, %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index2_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r _, 4, %rbp, 5, _
+ ; CHECK: %rbx = ADD64rr %rbx, %rbp
+
+ %rbx = LEA64r %rbp, 4, %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test_skip_opt_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+
+ %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbp' }
+ - { reg: '%rax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
+ ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %rbp = ADD64ri8 %rbp, 5
+
+ CMP64rr %rax, killed %rbx, implicit-def %eflags
+ %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %rax, %rbp, %rbx
+ %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_opt_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+
+ %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbp' }
+ - { reg: '%rax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
+ ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %ebp = ADD32ri8 %ebp, 5
+
+ CMP64rr %rax, killed %rbx, implicit-def %eflags
+ %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %rax, %rbp, %rbx
+ %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _
+ RETQ %ebp
+
+...
+
+
+
diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll
deleted file mode 100644
index a9cf086dbd90..000000000000
--- a/test/CodeGen/X86/lrshrink.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-
-; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call
-; to minimize live-range.
-
-define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) {
-entry:
- br i1 %a, label %then, label %else
-
-then:
- br label %else
-
-else:
- %0 = phi i64 [ 4, %entry ], [ 10, %then ]
- %r = phi i64 [ %r1, %entry ], [ %r2, %then ]
- %s = phi i64 [ %s1, %entry ], [ %s2, %then ]
- %t = phi i64 [ %t1, %entry ], [ %t2, %then ]
-; CHECK-LABEL: test:
-; CHECK: add
-; CHECK: add
-; CHECK: call
-; CHECK: add
-; CHECK: call
-; CHECK: add
-; CHECK: call
-; CHECK: add
- %1 = tail call i32 @_Z3foov()
- %2 = zext i32 %1 to i64
- %3 = tail call i32 @_Z3foov()
- %4 = zext i32 %3 to i64
- %5 = tail call i32 @_Z3foov()
- %6 = zext i32 %5 to i64
- %7 = add nuw nsw i64 %0, %r
- tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
- %8 = add nuw nsw i64 %2, %7
- %9 = add nuw nsw i64 %4, %8
- %10 = add nuw nsw i64 %6, %9
- %11 = add nuw nsw i64 %s, %t
- tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
- %12 = add nuw nsw i64 %10, %11
- ret i64 %12
-}
-
-declare i32 @_Z3foov()
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!1, !2}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug)
-!1 = !{i32 2, !"Dwarf Version", i32 4}
-!2 = !{i32 2, !"Debug Info Version", i32 3}
-!3 = !DIFile(filename: "a.c", directory: "./")
-!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0)
-!5 = !DILocalVariable(name: "x", scope: !4)
-!6 = !DILocation(line: 4, scope: !4)
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index af86df510016..d332b2f3169f 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-8, %rax
@@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm7
+; SSE2-NEXT: pmullw %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm7
+; SSE2-NEXT: paddd %xmm7, %xmm2
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm4
-; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: pmullw %xmm4, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-16, %rax
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index c5de8dd96cbc..91087f650ad6 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -300,8 +300,8 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
;
; KNL_32-LABEL: test6:
; KNL_32: # BB#0:
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
@@ -1575,7 +1575,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
-; ALL-LABEL: test30:
+; ALL-LABEL: test30
; ALL-NOT: gather
%sext_ind = sext <3 x i32> %ind to <3 x i64>
@@ -1691,12 +1691,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-LABEL: test_gather_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi4:
+; KNL_32-NEXT: .Lcfi0:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi5:
+; KNL_32-NEXT: .Lcfi1:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi6:
+; KNL_32-NEXT: .Lcfi2:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1814,12 +1814,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi7:
+; KNL_32-NEXT: .Lcfi3:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi8:
+; KNL_32-NEXT: .Lcfi4:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi9:
+; KNL_32-NEXT: .Lcfi5:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1936,12 +1936,12 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi10:
+; KNL_32-NEXT: .Lcfi6:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi11:
+; KNL_32-NEXT: .Lcfi7:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi12:
+; KNL_32-NEXT: .Lcfi8:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2058,12 +2058,12 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi13:
+; KNL_32-NEXT: .Lcfi9:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi14:
+; KNL_32-NEXT: .Lcfi10:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi15:
+; KNL_32-NEXT: .Lcfi11:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2139,12 +2139,12 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-LABEL: test_pr28312:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi16:
+; KNL_32-NEXT: .Lcfi12:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi17:
+; KNL_32-NEXT: .Lcfi13:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi18:
+; KNL_32-NEXT: .Lcfi14:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-32, %esp
; KNL_32-NEXT: subl $32, %esp
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 2f7714e63886..71417694b0d4 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
@@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_012u:
@@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
@@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_019u:
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index 94bbe75702cb..e62a1d04dad6 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -17,9 +17,9 @@
;
; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
-; TOPDOWN-NOT: imull {{[0-9]*}}(
+; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
-; TOPDOWN-NOT: imull {{[0-9]*}}(
+; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
; TOPDOWN-LABEL: %for.end
diff --git a/test/CodeGen/X86/mul-i1024.ll b/test/CodeGen/X86/mul-i1024.ll
index 340aa047c022..87661004373f 100644
--- a/test/CodeGen/X86/mul-i1024.ll
+++ b/test/CodeGen/X86/mul-i1024.ll
@@ -11,7 +11,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $2640, %esp # imm = 0xA50
+; X32-NEXT: subl $2632, %esp # imm = 0xA48
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: movl 64(%eax), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -58,7 +58,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl 20(%eax), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 28(%eax), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 32(%eax), %ecx
@@ -1992,7 +1992,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -2002,23 +2002,19 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl $0, %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
@@ -2035,8 +2031,14 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %eax, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -2045,157 +2047,144 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %ecx
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: addl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %dl
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl %dl, %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %edx, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %eax, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -2215,16 +2204,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl %edx, %esi
; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %edx, %edx
-; X32-NEXT: andl $1, %edx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
@@ -2246,7 +2234,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2268,16 +2256,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
@@ -2306,112 +2293,97 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %eax, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movzbl %al, %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
+; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %edx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -2429,16 +2401,14 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl %ebx, %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl $0, %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -2447,16 +2417,16 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -2467,10 +2437,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %edx, %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl %edi, %eax
; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
@@ -2485,52 +2455,50 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: adcl %esi, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, (%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -2550,16 +2518,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
@@ -2582,7 +2549,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
@@ -2603,16 +2570,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
@@ -2639,105 +2605,88 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movzbl %al, %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -2766,76 +2715,70 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: setb %dl
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl %dl, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: addl %eax, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT: movl %ecx, %edi
-; X32-NEXT: adcl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl %ecx, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %ecx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
@@ -2847,55 +2790,53 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -2915,20 +2856,19 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
@@ -2968,16 +2908,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
@@ -3004,109 +2943,87 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzbl %al, %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ebx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %ecx, %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -3127,56 +3044,33 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
@@ -3214,37 +3108,35 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %eax, %ecx
-; X32-NEXT: movl $0, %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %edx, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3264,38 +3156,37 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %edi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: movl (%esp), %edx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl %esi, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
@@ -3319,15 +3210,14 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
@@ -3337,118 +3227,113 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ebx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %edx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movzbl %cl, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %edi, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -3462,36 +3347,34 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
@@ -3500,18 +3383,17 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edi
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
@@ -3555,14 +3437,13 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
@@ -3612,136 +3493,120 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %esi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %edi, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -3760,16 +3625,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %edi, %edx
; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
@@ -3786,79 +3650,80 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -3877,35 +3742,35 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl %edx, %eax
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
+; X32-NEXT: setb %dl
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzbl %dl, %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3925,44 +3790,43 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
@@ -3986,15 +3850,14 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %esi
; X32-NEXT: addl %eax, %edx
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
@@ -4007,10 +3870,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: addl %edx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl %ebx, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -4025,116 +3888,107 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: adcl $0, %edx
; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %esi, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl %ebx, %edx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
@@ -4156,10 +4010,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
@@ -4168,45 +4022,46 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: adcl $0, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl %edx, %eax
-; X32-NEXT: movl $0, %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: sbbl %edi, %edi
-; X32-NEXT: andl $1, %edi
+; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: setb %cl
; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl %cl, %ecx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -4216,15 +4071,16 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -4236,25 +4092,21 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
@@ -4264,6 +4116,11 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -4278,13 +4135,13 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
@@ -4292,10 +4149,6 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
@@ -4304,6 +4157,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
@@ -4312,67 +4169,66 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 16(%ebp), %ebx
-; X32-NEXT: movl %ecx, 4(%ebx)
-; X32-NEXT: movl 16(%ebp), %ecx
-; X32-NEXT: movl %eax, (%ecx)
+; X32-NEXT: movl 16(%ebp), %edx
+; X32-NEXT: movl %ecx, 4(%edx)
+; X32-NEXT: movl %eax, (%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 8(%ecx)
+; X32-NEXT: movl %eax, 8(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 12(%ecx)
+; X32-NEXT: movl %eax, 12(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 16(%ecx)
+; X32-NEXT: movl %eax, 16(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 20(%ecx)
+; X32-NEXT: movl %eax, 20(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 24(%ecx)
+; X32-NEXT: movl %eax, 24(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 28(%ecx)
+; X32-NEXT: movl %eax, 28(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 32(%ecx)
+; X32-NEXT: movl %eax, 32(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 36(%ecx)
+; X32-NEXT: movl %eax, 36(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 40(%ecx)
+; X32-NEXT: movl %eax, 40(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 44(%ecx)
+; X32-NEXT: movl %eax, 44(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 48(%ecx)
+; X32-NEXT: movl %eax, 48(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 52(%ecx)
+; X32-NEXT: movl %eax, 52(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 56(%ecx)
+; X32-NEXT: movl %eax, 56(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 60(%ecx)
+; X32-NEXT: movl %eax, 60(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 64(%ecx)
+; X32-NEXT: movl %eax, 64(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 68(%ecx)
+; X32-NEXT: movl %eax, 68(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 72(%ecx)
+; X32-NEXT: movl %eax, 72(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 76(%ecx)
+; X32-NEXT: movl %eax, 76(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 80(%ecx)
+; X32-NEXT: movl %eax, 80(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 84(%ecx)
+; X32-NEXT: movl %eax, 84(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 88(%ecx)
+; X32-NEXT: movl %eax, 88(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 92(%ecx)
+; X32-NEXT: movl %eax, 92(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 96(%ecx)
-; X32-NEXT: movl %edx, 100(%ecx)
+; X32-NEXT: movl %eax, 96(%edx)
+; X32-NEXT: movl %esi, 100(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 104(%ecx)
-; X32-NEXT: movl %esi, 108(%ecx)
+; X32-NEXT: movl %eax, 104(%edx)
+; X32-NEXT: movl %edi, 108(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 112(%ecx)
-; X32-NEXT: movl %edi, 116(%ecx)
+; X32-NEXT: movl %eax, 112(%edx)
+; X32-NEXT: movl %ebx, 116(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 120(%ecx)
+; X32-NEXT: movl %eax, 120(%edx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 124(%ecx)
+; X32-NEXT: movl %eax, 124(%edx)
; X32-NEXT: leal -12(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
@@ -4390,40 +4246,41 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $352, %rsp # imm = 0x160
; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq 48(%rdi), %r8
-; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq 40(%rdi), %rcx
-; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq 48(%rdi), %r9
+; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq 40(%rdi), %rbp
+; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq 32(%rdi), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdi, %r13
-; X64-NEXT: xorl %r9d, %r9d
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbx, %rcx
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r11, %rcx
-; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
+; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rcx, %r11
+; X64-NEXT: adcq %rdi, %rbp
+; X64-NEXT: setb %bl
+; X64-NEXT: movzbl %bl, %ebx
; X64-NEXT: addq %rax, %rbp
; X64-NEXT: adcq %rdx, %rbx
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %r11, %r12
+; X64-NEXT: movq %r11, %r8
; X64-NEXT: addq %rax, %r12
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: movq %r9, (%rsp) # 8-byte Spill
; X64-NEXT: adcq %rdx, %rax
; X64-NEXT: addq %rbp, %r12
; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
@@ -4433,186 +4290,182 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: xorl %ebp, %ebp
; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq %rax, %rdi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq 8(%rsi), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rbp
-; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: xorl %r11d, %r11d
; X64-NEXT: movq %rax, %r15
; X64-NEXT: addq %rcx, %r15
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r10, %r15
+; X64-NEXT: addq %rdi, %r15
; X64-NEXT: adcq %rcx, %rbp
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: setb %bl
; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: movzbl %bl, %ebx
; X64-NEXT: adcq %rdx, %rbx
; X64-NEXT: movq 16(%rsi), %rax
-; X64-NEXT: movq %rsi, %r14
-; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r9
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r10, %rcx
-; X64-NEXT: movq %rcx, %rsi
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: movq %rdi, %r9
-; X64-NEXT: adcq %rdx, %r9
-; X64-NEXT: addq %rbp, %rsi
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: adcq %rbx, %r9
-; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rdi, %r8
-; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r13, %rbp
-; X64-NEXT: movq (%rbp), %rax
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: addq %rax, %r14
+; X64-NEXT: movq %rcx, %r11
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: addq %rbp, %r14
+; X64-NEXT: adcq %rbx, %r11
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %r8, %rbp
+; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: adcq %rcx, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq (%r10), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: movq %rdi, %r9
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: adcq %rdi, %rax
+; X64-NEXT: adcq %rcx, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq 32(%r14), %rax
+; X64-NEXT: movq 32(%r13), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %r8
+; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: adcq %rdx, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: adcq %r15, %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r12
-; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r10, %rcx
+; X64-NEXT: adcq %r14, %r12
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: adcq %r11, %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r9, %r10
-; X64-NEXT: movq 8(%rbp), %rax
+; X64-NEXT: movq %r11, %rdi
+; X64-NEXT: movq 8(%r10), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rbp, %rdi
-; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: mulq %rdx
-; X64-NEXT: xorl %r9d, %r9d
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rsi, %r12
+; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rsi, %r11
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r14, %r12
+; X64-NEXT: addq %rcx, %r11
; X64-NEXT: adcq %rsi, %rbp
; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
+; X64-NEXT: setb %bl
; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: movzbl %bl, %ebx
; X64-NEXT: adcq %rdx, %rbx
-; X64-NEXT: movq 16(%rdi), %rax
+; X64-NEXT: movq 16(%r10), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r9
-; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: addq %rax, %r8
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: adcq %rdx, %r10
+; X64-NEXT: addq %rbp, %r8
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: adcq %rbx, %r10
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: movq %rcx, %r12
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: addq %r9, %rdx
; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r14, %r9
-; X64-NEXT: addq %rax, %r9
-; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: movq %r9, %rdx
-; X64-NEXT: adcq %rbx, %rsi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r12, %r15
+; X64-NEXT: movq %r11, %r8
+; X64-NEXT: adcq %r8, %r15
; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: adcq %r10, %rdi
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: movq 40(%rsi), %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %rdi
-; X64-NEXT: xorl %r8d, %r8d
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload
-; X64-NEXT: addq %rdi, %rcx
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: addq %r9, %rdi
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r13, %rcx
-; X64-NEXT: adcq %rdi, %rbp
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
+; X64-NEXT: addq %r13, %rdi
+; X64-NEXT: adcq %r9, %rbp
+; X64-NEXT: setb %bl
; X64-NEXT: addq %rax, %rbp
-; X64-NEXT: adcq %rdx, %rbx
+; X64-NEXT: movzbl %bl, %r11d
+; X64-NEXT: adcq %rdx, %r11
; X64-NEXT: movq 48(%rsi), %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r13, %r8
-; X64-NEXT: addq %rax, %r8
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: addq %rbp, %r8
-; X64-NEXT: adcq %rbx, %rax
-; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: addq %r13, %r14
-; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rcx, %r12
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: addq %rbp, %rbx
+; X64-NEXT: adcq %r11, %rsi
+; X64-NEXT: movq %r13, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: addq %r13, %r12
; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r8, %r15
-; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: adcq %rdi, %r8
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rbx, %rcx
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; X64-NEXT: movq %rdx, %rax
; X64-NEXT: addq %r13, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: adcq %rdi, %rax
+; X64-NEXT: movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT: adcq %r9, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rdx, %rax
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
-; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: movq 56(%rax), %r11
; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: movq %rdi, %r10
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rsi, %rbx
@@ -4621,15 +4474,15 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rbx, %r10
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rbx, %r8
; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
+; X64-NEXT: setb %cl
; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movq %rdi, %r11
; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 # 8-byte Reload
; X64-NEXT: addq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
@@ -4637,299 +4490,297 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload
; X64-NEXT: addq %rax, %r15
; X64-NEXT: adcq %rdx, %r12
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r10, %rbp
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r9, %rcx
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: addq %rsi, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %rbp, %rcx
-; X64-NEXT: sbbq %rbp, %rbp
-; X64-NEXT: andl $1, %ebp
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: adcq %rbp, %r13
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: adcq %r10, %r13
+; X64-NEXT: addq %r9, %rsi
+; X64-NEXT: adcq %r8, %r13
; X64-NEXT: adcq $0, %r15
; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %rdi, %rbp
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r10, %rbx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r14
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rcx, %rbp
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: movq 24(%rax), %rcx
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rbp
-; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rbx, %r8
+; X64-NEXT: addq %rbp, %r8
; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: sbbq %rdi, %rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload
-; X64-NEXT: adcq %r14, %rbp
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: adcq %rdx, %rbp
+; X64-NEXT: addq %r14, %rbp
+; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: adcq %r9, %rbx
+; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: adcq %rdx, %rbx
; X64-NEXT: addq %rsi, %r10
; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %r13, %r8
; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r15, %rbx
-; X64-NEXT: adcq %r12, %rbp
-; X64-NEXT: movl $0, %r8d
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: sbbq %r10, %r10
-; X64-NEXT: andl $1, %r10d
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: addq %r15, %rbp
+; X64-NEXT: adcq %r12, %rbx
+; X64-NEXT: setb %r15b
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: movq %r11, %rsi
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r9, %rdi
+; X64-NEXT: addq %r11, %rdi
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rdi, %r13
-; X64-NEXT: adcq %rsi, %r9
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r12
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %r8, %r12
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: adcq %r14, %rsi
+; X64-NEXT: addq %r14, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload
+; X64-NEXT: adcq %r9, %r14
; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: addq %rbx, %r15
-; X64-NEXT: adcq %rbp, %r13
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
-; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
+; X64-NEXT: adcq %rdx, %r14
+; X64-NEXT: addq %rbp, %r13
+; X64-NEXT: adcq %rbx, %r11
+; X64-NEXT: movzbl %r15b, %eax
+; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
; X64-NEXT: movq %r13, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
+; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: movq 24(%rax), %rbp
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: movq 24(%rax), %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: movq %rsi, %r11
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rbx, %rbp
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rbx, %r15
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %rbp, %r15
+; X64-NEXT: adcq %rsi, %rbx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 # 8-byte Reload
; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
; X64-NEXT: addq %rax, %r8
; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r11, %rbp
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, %r11
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: adcq %rbx, %r11
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
-; X64-NEXT: addq %r9, %rsi
-; X64-NEXT: adcq %r15, %r11
+; X64-NEXT: adcq %rbp, %rdi
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rsi, %rbp
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %rsi
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: addq %r14, %rbx
+; X64-NEXT: adcq %r15, %rsi
; X64-NEXT: adcq $0, %r8
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbp, %r14
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdi, %r15
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rcx, %rbp
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %r12, %r13
-; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: sbbq %rdi, %rdi
-; X64-NEXT: andl $1, %edi
+; X64-NEXT: setb %dil
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
+; X64-NEXT: mulq %r12
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload
+; X64-NEXT: addq %r13, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload
-; X64-NEXT: addq %r14, %rbx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %rsi, %r9
+; X64-NEXT: adcq %r14, %rbp
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: adcq %rdx, %rbp
+; X64-NEXT: addq %rbx, %r9
; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r11, %rbp
-; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: adcq %r10, %rcx
-; X64-NEXT: movl $0, %r12d
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: sbbq %r9, %r9
-; X64-NEXT: andl $1, %r9d
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: adcq %rsi, %r11
+; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: adcq %r10, %rbp
+; X64-NEXT: setb %r9b
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r10, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: adcq %rsi, %r8
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, %rbp
-; X64-NEXT: mulq %r13
-; X64-NEXT: addq %r8, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %rbx, %r15
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %bl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rsi
-; X64-NEXT: addq %r14, %rsi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rdi
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: adcq %rdx, %rdi
-; X64-NEXT: addq %rbx, %r11
-; X64-NEXT: adcq %rcx, %r15
-; X64-NEXT: adcq %r12, %rsi
-; X64-NEXT: adcq %r9, %rdi
+; X64-NEXT: movq %r10, %rcx
+; X64-NEXT: addq %r13, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: adcq %r14, %rsi
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: adcq %rbp, %r15
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
; X64-NEXT: addq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
; X64-NEXT: adcq $0, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
; X64-NEXT: adcq $0, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: adcq $0, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
; X64-NEXT: adcq $0, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, %rax
@@ -4937,9 +4788,10 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %r8, %rbp
; X64-NEXT: movq %rbp, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: movq %rcx, %r11
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: addq %rsi, %rcx
@@ -4948,291 +4800,279 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rcx, %r11
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %r8
; X64-NEXT: adcq %rbx, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
+; X64-NEXT: setb %cl
; X64-NEXT: movq %rbp, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdi, %r15
; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq %r10, %r9
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %r9 # 8-byte Folded Reload
-; X64-NEXT: movq %r8, %r12
-; X64-NEXT: adcq (%rsp), %r12 # 8-byte Folded Reload
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r9 # 8-byte Folded Reload
+; X64-NEXT: movq %r12, %r10
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
; X64-NEXT: addq %rax, %r9
-; X64-NEXT: adcq %rdx, %r12
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r13, %rbp
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: adcq %rdx, %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: addq %rcx, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %rbp
+; X64-NEXT: movq %rbp, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %rsi, %r8
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: adcq %rax, %r15
; X64-NEXT: addq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
; X64-NEXT: addq %r14, %rbx
-; X64-NEXT: adcq %r11, %r8
+; X64-NEXT: adcq %r8, %r15
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %r12
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, %r8
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: addq %r14, %rcx
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: movq 56(%rax), %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: movq 56(%rax), %rdi
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rcx, %r14
+; X64-NEXT: adcq %rbp, %rsi
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdi, %r8
; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: addq %r13, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; X64-NEXT: addq %r11, %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
-; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
+; X64-NEXT: adcq %r13, %rsi
+; X64-NEXT: addq %rax, %rcx
; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r8, %r11
-; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rbx, %r12
+; X64-NEXT: adcq %r15, %r14
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: addq %r9, %rdi
-; X64-NEXT: adcq %r12, %rsi
-; X64-NEXT: movl $0, %r14d
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: sbbq %r10, %r10
-; X64-NEXT: andl $1, %r10d
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r8, %rcx
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq %r10, %rsi
+; X64-NEXT: setb {{[0-9]+}}(%rsp) # 1-byte Folded Spill
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %r11, %r8
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: addq %r8, %rax
-; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rbx, %r8
+; X64-NEXT: adcq %r15, %r9
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movzbl %bl, %edi
+; X64-NEXT: adcq %rdi, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 # 8-byte Reload
-; X64-NEXT: addq %r13, %r15
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; X64-NEXT: adcq %rbp, %r11
+; X64-NEXT: addq %r11, %r15
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: adcq %r13, %rbp
; X64-NEXT: addq %rax, %r15
-; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: adcq %rsi, %rbx
-; X64-NEXT: adcq %r14, %r15
-; X64-NEXT: adcq %r10, %r11
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq %rdx, %rbp
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: adcq %rsi, %r8
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload
+; X64-NEXT: adcq %rax, %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: addq %rcx, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq %rcx, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq %rcx, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq %rcx, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: addq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: adcq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload
; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
-; X64-NEXT: movl $0, %eax
-; X64-NEXT: adcq $0, %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movl $0, %eax
-; X64-NEXT: adcq $0, %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movl $0, %eax
-; X64-NEXT: adcq $0, %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: sbbq %rax, %rax
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: setb -{{[0-9]+}}(%rsp) # 1-byte Folded Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r8, %rbp
-; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: adcq $0, %rdi
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbp, %r12
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: sbbq %rbp, %rbp
-; X64-NEXT: andl $1, %ebp
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rbx, %r12
+; X64-NEXT: adcq %rdi, %rcx
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %r9
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %r9 # 8-byte Folded Reload
-; X64-NEXT: movq (%rsp), %r10 # 8-byte Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
-; X64-NEXT: addq %rax, %r9
-; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r14, %rbx
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movzbl %bl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: addq %rax, %r8
+; X64-NEXT: adcq %rdx, %rcx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: sbbq %rdi, %rdi
-; X64-NEXT: andl $1, %edi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq %rdi, %rbx
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: addq %r13, %rbp
-; X64-NEXT: adcq %r12, %rbx
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r9, %r12
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r10, %r8
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rdi, %rcx
+; X64-NEXT: setb %bl
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rdi
; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
+; X64-NEXT: adcq %r12, %r11
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %r13, %rbx
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: adcq %rsi, %r14
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rsi, %r9
; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r8, %rcx
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: adcq %rsi, %rbx
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload
+; X64-NEXT: addq %r13, %rsi
+; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload
-; X64-NEXT: addq %r14, %rsi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: adcq %r13, %rcx
+; X64-NEXT: adcq %r14, %rcx
; X64-NEXT: addq %rax, %rsi
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rdi
-; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: addq %rdi, %r12
+; X64-NEXT: adcq %r11, %r8
+; X64-NEXT: movq %r8, %r11
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: addq %r12, %rsi
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: movq %rcx, %r12
-; X64-NEXT: movl $0, %r10d
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: sbbq %r9, %r9
-; X64-NEXT: andl $1, %r9d
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill
+; X64-NEXT: setb -{{[0-9]+}}(%rsp) # 1-byte Folded Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq %r10, %rsi
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
@@ -5244,82 +5084,85 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: addq %rcx, %rdi
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
-; X64-NEXT: mulq %rbp
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rdi, %r10
; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
+; X64-NEXT: setb %bl
; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: mulq %r9
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movzbl %bl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: addq %r14, %rsi
+; X64-NEXT: addq %r13, %rsi
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq %r13, %rcx
+; X64-NEXT: adcq %r14, %rcx
; X64-NEXT: addq %rax, %rsi
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq %r12, %rbx
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: addq {{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT: adcq (%rsp), %r10 # 8-byte Folded Reload
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload
+; X64-NEXT: adcq %rax, %rsi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: addq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: adcq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq %r15, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq %r11, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload
-; X64-NEXT: movq %r13, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: addq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: adcq %rax, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: adcq %r15, %r12
+; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rbp, %r11
+; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; X64-NEXT: movq 64(%r9), %r11
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq 64(%rcx), %r11
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: addq %rsi, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq 72(%r9), %rcx
+; X64-NEXT: movq 72(%rcx), %rsi
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %rbx, %r8
-; X64-NEXT: adcq %rbp, %rcx
-; X64-NEXT: sbbq %rbp, %rbp
-; X64-NEXT: andl $1, %ebp
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: adcq %rbp, %rsi
+; X64-NEXT: setb %bl
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rcx, %r10
; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq %rbp, %rsi
+; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: movzbl %bl, %eax
+; X64-NEXT: adcq %rax, %rcx
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: mulq %rcx
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: mulq %rdx
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload
@@ -5327,7 +5170,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload
; X64-NEXT: adcq %r14, %r15
; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: adcq %rsi, %r15
+; X64-NEXT: adcq %rcx, %r15
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: movq %r11, %rsi
@@ -5335,8 +5178,8 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rdi
@@ -5349,165 +5192,159 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: addq %rdi, %rax
; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: addq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: adcq %r13, %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
; X64-NEXT: addq %rax, %rbx
; X64-NEXT: adcq %rdx, %r14
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: addq %r13, %rbx
; X64-NEXT: adcq %r8, %r14
; X64-NEXT: adcq $0, %r12
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r9, %rbp
-; X64-NEXT: movq 80(%rbp), %r8
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: movq 80(%rbp), %rdi
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r11, %r9
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %r10, %rcx
+; X64-NEXT: addq %r8, %rcx
; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq 88(%rbp), %r10
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %r8
; X64-NEXT: adcq %rsi, %rbp
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: setb %r11b
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
-; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %rbp, %rsi
+; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: mulq %rdx
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: addq %r9, %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: adcq %rdi, %rax
-; X64-NEXT: addq %rbx, %r11
-; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r14, %r9
-; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: addq %rsi, %rbp
+; X64-NEXT: adcq %rcx, %rax
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r8
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %rbp
; X64-NEXT: adcq $0, %rax
-; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: addq %r12, %rbp
+; X64-NEXT: movq %rbp, %r8
; X64-NEXT: adcq %r15, %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movl $0, %r12d
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: sbbq %r11, %r11
-; X64-NEXT: andl $1, %r11d
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: setb %r14b
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: addq %r15, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, %rbx
; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
+; X64-NEXT: setb %sil
; X64-NEXT: movq %rbp, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: addq %r9, %rsi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq %r13, %rcx
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
; X64-NEXT: addq %rax, %rsi
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %r14, %r15
-; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r9, %rdi
-; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r12, %rsi
+; X64-NEXT: addq %r8, %r12
+; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r11, %rbx
+; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movzbl %r14b, %eax
+; X64-NEXT: adcq %rax, %rsi
; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r11, %rcx
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: imulq %rax, %r10
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %r10, %rdx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %r8
-; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: imulq %rbp, %rdi
+; X64-NEXT: addq %rdx, %rdi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rsi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; X64-NEXT: imulq %r11, %rsi
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; X64-NEXT: imulq %rbp, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %r8, %rax
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: addq %r8, %r9
+; X64-NEXT: adcq %rdi, %rax
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r8
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rcx, %rbx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq %r10, %rbp
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r15
; X64-NEXT: addq %rbx, %r15
; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: adcq %rcx, %r12
-; X64-NEXT: addq %r11, %r9
-; X64-NEXT: adcq %r14, %r12
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rdi, %r13
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r12
+; X64-NEXT: addq %r9, %r13
+; X64-NEXT: adcq %r8, %r12
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
; X64-NEXT: movq 120(%rdx), %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload
; X64-NEXT: imulq %r10, %rcx
@@ -5526,12 +5363,12 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: imulq %rbx, %rcx
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: imulq %rdi, %rax
; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: addq %r11, %r13
+; X64-NEXT: addq %r11, %r9
; X64-NEXT: adcq %rsi, %rax
; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq %rdi, %rax
@@ -5540,371 +5377,367 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X64-NEXT: movq %rax, %r14
; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
-; X64-NEXT: adcq %rbp, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
+; X64-NEXT: addq %rbp, %rdi
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %sil
; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %r13, %rax
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r9, %rax
; X64-NEXT: adcq %r11, %rdx
; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
; X64-NEXT: adcq %r15, %rdi
-; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: adcq %r13, %rax
; X64-NEXT: adcq %r12, %rdx
; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq 80(%rsi), %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq 80(%rsi), %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq 88(%rsi), %r10
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq 88(%rsi), %rax
+; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rcx, %r11
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: addq %r8, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 # 8-byte Reload
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: sbbq %rdi, %rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rbx, %r14
+; X64-NEXT: adcq %rbp, %rcx
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq %rdi, %rbp
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %rbp
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; X64-NEXT: addq %r9, %r10
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %rbp, %r11
-; X64-NEXT: movq %r12, %rcx
-; X64-NEXT: movq 64(%rcx), %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq 72(%rcx), %r14
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 # 8-byte Reload
+; X64-NEXT: addq %r12, %rsi
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 # 8-byte Reload
+; X64-NEXT: adcq %r8, %r10
+; X64-NEXT: addq %rbx, %rsi
+; X64-NEXT: adcq %rbp, %r10
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: movq 64(%rdi), %r13
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 72(%rdi), %r9
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
+; X64-NEXT: addq %rcx, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rbp, %rcx
+; X64-NEXT: setb %r11b
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rsi, %rbp
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: addq %rcx, %rbp
+; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: adcq %rax, %rbx
+; X64-NEXT: movq %r13, %rax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r15, %r9
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: adcq %r8, %rax
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: adcq %rbx, %rax
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r9 # 8-byte Folded Reload
-; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: addq %r15, %rcx
+; X64-NEXT: adcq %r11, %r8
+; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: adcq %rbx, %r8
+; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r8
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq $0, %rsi
; X64-NEXT: adcq $0, %r10
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, %r12
-; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r14
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rbp, %rdi
-; X64-NEXT: adcq %rsi, %r9
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %rbp
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 # 8-byte Reload
-; X64-NEXT: addq %r12, %r15
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: adcq %rdi, %rcx
+; X64-NEXT: setb %dil
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload
+; X64-NEXT: addq %r14, %r15
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
+; X64-NEXT: adcq %r13, %r11
; X64-NEXT: addq %rax, %r15
-; X64-NEXT: adcq %rdx, %r8
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r13, %rdi
-; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r11, %r8
-; X64-NEXT: movl $0, %r9d
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: sbbq %r13, %r13
-; X64-NEXT: andl $1, %r13d
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r14, %rsi
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, %r11
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %rsi, %r15
+; X64-NEXT: adcq %r10, %r11
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r14, %rbx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rbp
+; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: mulq %rbp
+; X64-NEXT: adcq %rdi, %rcx
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rsi, %rdi
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movzbl %r8b, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: addq %r12, %rsi
+; X64-NEXT: addq %r14, %rsi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %rcx
; X64-NEXT: addq %rax, %rsi
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %r15, %r10
-; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r8, %rbx
+; X64-NEXT: addq %r15, %r9
+; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: adcq %r11, %rbx
; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r9, %rsi
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %rsi
; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r13, %rcx
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq 96(%rsi), %rcx
-; X64-NEXT: imulq %rcx, %rbp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: movq 96(%rbp), %rcx
+; X64-NEXT: imulq %rcx, %rdi
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %r11, %rdi
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rbp, %rdx
-; X64-NEXT: movq 104(%rsi), %r8
-; X64-NEXT: imulq %r8, %rdi
-; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: movq %rdi, %r10
-; X64-NEXT: movq 112(%rsi), %rax
-; X64-NEXT: movq %rsi, %rbp
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq 104(%rbp), %r8
+; X64-NEXT: imulq %r8, %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: movq %rsi, %r11
+; X64-NEXT: movq 112(%rbp), %rax
+; X64-NEXT: movq %rbp, %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: imulq %rbp, %rsi
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: movq 120(%rdi), %rdi
; X64-NEXT: imulq %rbx, %rdi
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 120(%rbp), %rdi
-; X64-NEXT: imulq %rsi, %rdi
; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %r10, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: addq %r9, %r10
+; X64-NEXT: adcq %r11, %rdi
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbp, %r9
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rbx, %rbp
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rbx, %r15
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %rbp, %r12
; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
+; X64-NEXT: setb %cl
; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: addq %r11, %r8
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rsi, %rbp
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %rbx
+; X64-NEXT: addq %r10, %rbp
; X64-NEXT: adcq %rdi, %rbx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: imulq %rax, %rsi
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 # 8-byte Reload
-; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
+; X64-NEXT: imulq %r11, %rcx
; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movq %rcx, %rbp
+; X64-NEXT: movq %rcx, %r9
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: imulq %rsi, %rcx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 # 8-byte Reload
+; X64-NEXT: imulq %r15, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rax, %r10
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq %r14, %r13
-; X64-NEXT: imulq %rdi, %r13
-; X64-NEXT: addq %rdx, %r13
-; X64-NEXT: addq %r11, %r10
-; X64-NEXT: adcq %rbp, %r13
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r14
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: imulq %r14, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: addq %r8, %r10
+; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r13
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rbp, %rcx
+; X64-NEXT: addq %rdi, %rcx
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r12, %rbp
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: adcq %r9, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %rcx, %rsi
+; X64-NEXT: adcq %r9, %rdi
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: addq %r10, %rax
-; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %r11
-; X64-NEXT: adcq %r8, %rax
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq %r12, %rsi
+; X64-NEXT: adcq %rbp, %rax
; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT: addq (%rsp), %rcx # 8-byte Folded Reload
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload
-; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: movq %rcx, %r9
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11 # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT: adcq (%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, (%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 8(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 16(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 24(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 32(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 40(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 48(%rcx)
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, 56(%rcx)
-; X64-NEXT: movq %r8, 64(%rcx)
-; X64-NEXT: movq %r9, 72(%rcx)
-; X64-NEXT: movq %rbx, 80(%rcx)
-; X64-NEXT: movq %rbp, 88(%rcx)
-; X64-NEXT: movq %rdi, 96(%rcx)
-; X64-NEXT: movq %r11, 104(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, (%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 8(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 16(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 24(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 32(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 40(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 48(%rcx)
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, 56(%rcx)
+; X64-NEXT: movq %r9, 64(%rcx)
+; X64-NEXT: movq %r10, 72(%rcx)
+; X64-NEXT: movq %rbp, 80(%rcx)
+; X64-NEXT: movq %rbx, 88(%rcx)
+; X64-NEXT: movq %r8, 96(%rcx)
+; X64-NEXT: movq %rsi, 104(%rcx)
; X64-NEXT: movq %rax, 112(%rcx)
; X64-NEXT: movq %rdx, 120(%rcx)
; X64-NEXT: addq $352, %rsp # imm = 0x160
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index 341484718652..acd86e949894 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -3,7 +3,6 @@
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-LABEL: test:
@@ -138,18 +137,17 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: xorl %edx, %edx
; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -205,76 +203,70 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X64-NEXT: pushq %r14
; X64-NEXT: .Lcfi1:
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
; X64-NEXT: .Lcfi2:
; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: pushq %rbx
; X64-NEXT: .Lcfi3:
-; X64-NEXT: .cfi_def_cfa_offset 40
+; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .Lcfi4:
-; X64-NEXT: .cfi_offset %rbx, -40
-; X64-NEXT: .Lcfi5:
-; X64-NEXT: .cfi_offset %r12, -32
-; X64-NEXT: .Lcfi6:
; X64-NEXT: .cfi_offset %r14, -24
-; X64-NEXT: .Lcfi7:
+; X64-NEXT: .Lcfi5:
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq (%rdi), %r14
+; X64-NEXT: movq (%rdi), %r11
; X64-NEXT: movq 8(%rdi), %r8
-; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rbx
-; X64-NEXT: movq (%rsi), %r12
+; X64-NEXT: movq 16(%rdi), %rbx
+; X64-NEXT: movq 16(%rsi), %r10
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: movq 8(%rsi), %r15
; X64-NEXT: movq 24(%rdi), %rdi
-; X64-NEXT: imulq %r12, %rdi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: imulq %rcx, %rdi
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %r15, %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movq %rbx, %rdi
+; X64-NEXT: imulq %r15, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: movq %r10, %rdi
; X64-NEXT: imulq %r8, %rdi
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rax, %r10
; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 24(%rsi), %rbx
-; X64-NEXT: imulq %r14, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: adcq %rcx, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq 24(%rsi), %rdi
+; X64-NEXT: imulq %r11, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %r14, %r10
+; X64-NEXT: adcq %rbx, %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq %rax, %r14
; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rsi, %rbx
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rdi, %r14
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rbx, %r11
; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: sbbq %rcx, %rcx
-; X64-NEXT: andl $1, %ecx
+; X64-NEXT: setb %al
+; X64-NEXT: movzbl %al, %ecx
; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r15
; X64-NEXT: addq %rsi, %rax
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: movq %r10, (%r9)
-; X64-NEXT: movq %r14, 8(%r9)
+; X64-NEXT: addq %r10, %rax
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: movq %r14, (%r9)
+; X64-NEXT: movq %r11, 8(%r9)
; X64-NEXT: movq %rax, 16(%r9)
; X64-NEXT: movq %rdx, 24(%r9)
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: retq
@@ -286,4 +278,4 @@ entry:
ret void
}
-attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+attributes #0 = { norecurse nounwind uwtable }
diff --git a/test/CodeGen/X86/mul-i512.ll b/test/CodeGen/X86/mul-i512.ll
index 14fbeae52796..3da17b69ffb5 100644
--- a/test/CodeGen/X86/mul-i512.ll
+++ b/test/CodeGen/X86/mul-i512.ll
@@ -74,14 +74,13 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl 20(%eax), %edi
; X32-NEXT: movl 24(%eax), %ebx
; X32-NEXT: movl 28(%eax), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl %eax
; X32-NEXT: pushl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -107,6 +106,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -123,8 +123,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
@@ -133,10 +132,11 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
@@ -145,25 +145,24 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: pushl %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
@@ -172,7 +171,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -183,14 +182,14 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
@@ -198,8 +197,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -213,7 +212,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
@@ -223,11 +222,11 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
@@ -240,20 +239,20 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: pushl %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
@@ -262,8 +261,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -274,21 +273,21 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
@@ -298,11 +297,11 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -313,8 +312,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -323,7 +322,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: pushl (%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -349,10 +348,10 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: pushl %edi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -365,18 +364,18 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: pushl %esi
; X32-NEXT: pushl %eax
; X32-NEXT: calll __multi3
; X32-NEXT: addl $32, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %edi
; X32-NEXT: pushl $0
; X32-NEXT: pushl $0
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
@@ -494,134 +493,142 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl %esi, %eax
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl $0, %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: sbbl %edx, %edx
-; X32-NEXT: andl $1, %edx
+; X32-NEXT: setb %dl
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movzbl %dl, %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %edx, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl %ebx, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ebx, %ebx
-; X32-NEXT: andl $1, %ebx
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl %eax, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: adcl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
@@ -629,140 +636,125 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %esi, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl $0, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: sbbl %esi, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movzbl %al, %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %eax, %edi
-; X32-NEXT: adcl %ecx, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl %edx, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
@@ -777,7 +769,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
@@ -789,25 +781,24 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl %eax, %esi
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: movl $0, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: sbbl %ecx, %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %al
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movzbl %al, %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
@@ -828,8 +819,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: adcl %esi, %ecx
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -838,7 +829,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
@@ -853,7 +844,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
@@ -864,36 +855,36 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 16(%ebp), %edi
-; X32-NEXT: movl %esi, 4(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl 16(%ebp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, (%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 8(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 12(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 16(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 20(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 24(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 28(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 32(%esi)
-; X32-NEXT: movl %ebx, 36(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 40(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 44(%esi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 48(%esi)
-; X32-NEXT: movl %ecx, 52(%esi)
-; X32-NEXT: movl %edx, 56(%esi)
-; X32-NEXT: movl %eax, 60(%esi)
+; X32-NEXT: movl %edi, 4(%esi)
+; X32-NEXT: movl 16(%ebp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, (%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 8(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 12(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 16(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 20(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 24(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 28(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 32(%edi)
+; X32-NEXT: movl %ebx, 36(%edi)
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 40(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 44(%edi)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 48(%edi)
+; X32-NEXT: movl %ecx, 52(%edi)
+; X32-NEXT: movl %edx, 56(%edi)
+; X32-NEXT: movl %eax, 60(%edi)
; X32-NEXT: leal -12(%ebp), %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
@@ -912,35 +903,36 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: pushq %rax
; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
; X64-NEXT: movq 24(%rdi), %r11
-; X64-NEXT: movq 16(%rdi), %r14
+; X64-NEXT: movq 16(%rdi), %r15
; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: movq 8(%rsi), %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %r11, %rax
; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rsi, %r10
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: addq %r9, %rsi
; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rsi, %r9
; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
+; X64-NEXT: setb %al
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rbp, %r8
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rcx, %rbp
@@ -952,46 +944,44 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rax, %r15
; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r13, %r12
+; X64-NEXT: adcq %r13, %rdx
; X64-NEXT: addq %rbp, %r15
-; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %r14
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq 8(%rdi), %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %r11, %rsi
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: addq %rsi, %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %rbp, %rbx
-; X64-NEXT: sbbq %rdi, %rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r8
+; X64-NEXT: setb %r11b
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rbx, %rbp
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: adcq %rax, %rsi
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: mulq %rdx
@@ -1001,10 +991,11 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: adcq %r14, %r13
; X64-NEXT: addq %rbp, %r10
; X64-NEXT: adcq %rsi, %r13
-; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload
+; X64-NEXT: addq %r8, %r10
; X64-NEXT: adcq %r9, %r13
; X64-NEXT: adcq $0, %r15
; X64-NEXT: adcq $0, %r12
+; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: movq 16(%rsi), %r8
; X64-NEXT: movq %rcx, %rax
@@ -1012,7 +1003,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r8
@@ -1027,14 +1018,14 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %rbp, %rsi
-; X64-NEXT: sbbq %rbp, %rbp
-; X64-NEXT: andl $1, %ebp
+; X64-NEXT: setb %bpl
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r9
; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq %rbp, %rbx
+; X64-NEXT: movzbl %bpl, %eax
+; X64-NEXT: adcq %rax, %rbx
; X64-NEXT: movq %r8, %rax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: mulq %rcx
@@ -1044,16 +1035,14 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: adcq %rdx, %r14
; X64-NEXT: addq %r9, %r11
; X64-NEXT: adcq %rbx, %r14
-; X64-NEXT: addq %r10, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT: addq %r10, %r12
+; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %r13, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
; X64-NEXT: adcq $0, %r11
; X64-NEXT: adcq $0, %r14
; X64-NEXT: addq %r15, %r11
-; X64-NEXT: adcq %r12, %r14
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rcx, %r13
-; X64-NEXT: sbbq %r9, %r9
-; X64-NEXT: andl $1, %r9d
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT: setb %r9b
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %r8
@@ -1072,12 +1061,12 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: addq %rbx, %rax
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
+; X64-NEXT: setb %sil
; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: movzbl %sil, %ecx
+; X64-NEXT: adcq %rcx, %rdx
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; X64-NEXT: addq %rbp, %rsi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
@@ -1088,9 +1077,10 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: adcq %r14, %rbx
; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r13, %rsi
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: adcq %rax, %rsi
; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r9, %rcx
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
; X64-NEXT: movq 32(%rcx), %rsi
@@ -1105,42 +1095,44 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: movq 48(%rcx), %rax
; X64-NEXT: movq %rcx, %rbx
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; X64-NEXT: imulq %r11, %rdi
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: imulq %rcx, %rdi
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rdi, %rdx
; X64-NEXT: movq 56(%rbx), %rbx
-; X64-NEXT: imulq %rcx, %rbx
+; X64-NEXT: imulq %rbp, %rbx
; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %r10, %r12
+; X64-NEXT: addq %r10, %r14
; X64-NEXT: adcq %r8, %rbx
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rbp, %rax
+; X64-NEXT: movq %rbp, %r10
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, %r8
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %rdi, %rbp
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rbp, %r15
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rbp, %r13
; X64-NEXT: adcq %rsi, %rdi
-; X64-NEXT: sbbq %rsi, %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: adcq %rsi, %r11
-; X64-NEXT: addq %r12, %r14
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rdi, %r9
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: addq %r14, %r9
; X64-NEXT: adcq %rbx, %r11
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
; X64-NEXT: movq 56(%rdx), %rcx
@@ -1152,49 +1144,50 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
-; X64-NEXT: imulq %r8, %rbx
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload
+; X64-NEXT: imulq %r15, %rbx
; X64-NEXT: addq %rdx, %rbx
; X64-NEXT: movq 32(%rbp), %rdi
-; X64-NEXT: movq 40(%rbp), %r12
+; X64-NEXT: movq 40(%rbp), %r8
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: imulq %r8, %rcx
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rax, %r14
; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload
-; X64-NEXT: imulq %rdi, %r13
-; X64-NEXT: addq %rdx, %r13
-; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq %rbx, %r13
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: imulq %rdi, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: adcq %rbx, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rbp, %rdi
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: sbbq %rbx, %rbx
-; X64-NEXT: andl $1, %ebx
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: adcq %r13, %rdx
+; X64-NEXT: addq %rbx, %rdi
+; X64-NEXT: adcq %rcx, %rbp
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: addq %rbp, %rax
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %rdi
-; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: adcq %r13, %rdi
+; X64-NEXT: adcq %r9, %rax
; X64-NEXT: adcq %r11, %rdx
; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 0bda41a30c69..d26cf02dd942 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2-LABEL: interleave_24i8_in:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE42: # BB#0:
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
-; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
+; SSE42-NEXT: movdqa %xmm2, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
-; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: por %xmm1, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm0, %xmm1
-; SSE42-NEXT: movq %xmm1, 16(%rdi)
+; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm0, %xmm2
+; SSE42-NEXT: movq %xmm2, 16(%rdi)
; SSE42-NEXT: movdqu %xmm3, (%rdi)
; SSE42-NEXT: retq
;
@@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm2, (%rdi)
+; AVX-NEXT: vmovdqu %xmm1, (%rdi)
; AVX-NEXT: retq
%s1 = load <8 x i8>, <8 x i8>* %q1, align 4
%s2 = load <8 x i8>, <8 x i8>* %q2, align 4
diff --git a/test/CodeGen/X86/overflow.ll b/test/CodeGen/X86/overflow.ll
index ff25b5de4933..00dadc4a80f6 100644
--- a/test/CodeGen/X86/overflow.ll
+++ b/test/CodeGen/X86/overflow.ll
@@ -27,16 +27,14 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
; X32-NEXT: addl $32, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %edi
-; X32-NEXT: xorl %ecx, %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: sbbl %edx, %edx
-; X32-NEXT: andl $1, %edx
+; X32-NEXT: setb %cl
+; X32-NEXT: movzbl %cl, %ecx
; X32-NEXT: movl %edi, (%esi)
; X32-NEXT: movl %eax, 4(%esi)
; X32-NEXT: movl %ecx, 8(%esi)
-; X32-NEXT: movl %edx, 12(%esi)
+; X32-NEXT: movl $0, 12(%esi)
; X32-NEXT: movl %esi, %eax
; X32-NEXT: leal -8(%ebp), %esp
; X32-NEXT: popl %esi
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 50a661fcca11..88cb7a6d5825 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmuludq %xmm3, %xmm0
+; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: retq
;
@@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE2-NEXT: pmuludq %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pmuludq %xmm7, %xmm5
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm8, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
@@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pmuludq %xmm4, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT: pmuludq %xmm5, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmuludq %xmm7, %xmm1
+; SSE41-NEXT: pmuludq %xmm6, %xmm2
+; SSE41-NEXT: pmuludq %xmm5, %xmm0
+; SSE41-NEXT: pmuludq %xmm8, %xmm4
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
@@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
@@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
-; SSE41-NEXT: pmuldq %xmm4, %xmm3
; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
-; SSE41-NEXT: pmuldq %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; SSE41-NEXT: pmuldq %xmm6, %xmm4
; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
-; SSE41-NEXT: pmuldq %xmm7, %xmm0
+; SSE41-NEXT: pmuldq %xmm5, %xmm0
+; SSE41-NEXT: pmuldq %xmm7, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm2
+; SSE41-NEXT: pmuldq %xmm8, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
@@ -1493,10 +1493,9 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
-; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm2, %ymm1
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
diff --git a/test/CodeGen/X86/pr27591.ll b/test/CodeGen/X86/pr27591.ll
index 3ff6c096d097..b71cb8c4b3a2 100644
--- a/test/CodeGen/X86/pr27591.ll
+++ b/test/CodeGen/X86/pr27591.ll
@@ -9,12 +9,6 @@ define void @test1(i32 %x) #0 {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: setne %al
-; CHECK-NEXT: # implicit-def: %EDI
-; CHECK-NEXT: movb %al, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: kmovd %k0, %edi
-; CHECK-NEXT: movb %dil, %al
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movzbl %al, %edi
; CHECK-NEXT: callq callee1
@@ -32,17 +26,9 @@ define void @test2(i32 %x) #0 {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: setne %al
-; CHECK-NEXT: # implicit-def: %EDI
-; CHECK-NEXT: movb %al, %dil
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: kmovd %k0, %edi
+; CHECK-NEXT: movzbl %al, %edi
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: movb %dil, %al
-; CHECK-NEXT: xorl %edi, %edi
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: movl $-1, %ecx
-; CHECK-NEXT: cmovnel %ecx, %edi
+; CHECK-NEXT: negl %edi
; CHECK-NEXT: callq callee2
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/pr28173.ll b/test/CodeGen/X86/pr28173.ll
index d9622b99bd98..3279982e4641 100644
--- a/test/CodeGen/X86/pr28173.ll
+++ b/test/CodeGen/X86/pr28173.ll
@@ -8,9 +8,8 @@ target triple = "x86_64-unknown-linux-gnu"
define i64 @foo64(i1 zeroext %i) #0 {
; CHECK-LABEL: foo64:
; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; CHECK-NEXT: orq $-2, %rdi
-; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orq $-2, %rax
; CHECK-NEXT: retq
br label %bb
@@ -26,8 +25,9 @@ end:
define i16 @foo16(i1 zeroext %i) #0 {
; CHECK-LABEL: foo16:
; CHECK: # BB#0:
-; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $65534, %eax # imm = 0xFFFE
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
br label %bb
@@ -43,9 +43,9 @@ end:
define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 {
; CHECK-LABEL: foo16_1:
; CHECK: # BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: orl $2, %edi
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $2, %eax
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
br label %bb
@@ -61,8 +61,8 @@ end:
define i32 @foo32(i1 zeroext %i) #0 {
; CHECK-LABEL: foo32:
; CHECK: # BB#0:
-; CHECK-NEXT: orl $-2, %edi
-; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $-2, %eax
; CHECK-NEXT: retq
br label %bb
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 94904018872b..8c970b3d4771 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -38,8 +38,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10
+; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3]
@@ -53,9 +52,10 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vmovaps %xmm15, %xmm1
; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9
+; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8
-; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0
-; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll
index d7a546c7396d..0dd8eb0ece85 100644
--- a/test/CodeGen/X86/pr31088.ll
+++ b/test/CodeGen/X86/pr31088.ll
@@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1
; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1
; F16C-NEXT: retq
%retval = fadd <2 x half> %arg0, %arg1
ret <2 x half> %retval
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
index d8ce230057ea..e1f726f0c625 100644
--- a/test/CodeGen/X86/pr32241.ll
+++ b/test/CodeGen/X86/pr32241.ll
@@ -4,49 +4,57 @@
define i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: pushl %esi
; CHECK-NEXT: .Lcfi0:
-; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: subl $24, %esp
+; CHECK-NEXT: .Lcfi1:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .Lcfi2:
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF
; CHECK-NEXT: movw $-15498, {{[0-9]+}}(%esp) # imm = 0xC376
; CHECK-NEXT: movw $19417, {{[0-9]+}}(%esp) # imm = 0x4BD9
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx
-; CHECK-NEXT: kxnorw %k0, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: testw %cx, %cx
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: cmpw $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: jmp .LBB0_1
-; CHECK-NEXT: .LBB0_1: # %lor.rhs
+; CHECK-NEXT: # BB#1: # %lor.rhs
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jmp .LBB0_2
; CHECK-NEXT: .LBB0_2: # %lor.end
-; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
-; CHECK-NEXT: kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; CHECK-NEXT: subl %edx, %esi
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: xorl $-1, %edx
+; CHECK-NEXT: cmpl $0, %edx
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_4
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_3: # %lor.rhs4
+; CHECK-NEXT: # BB#3: # %lor.rhs4
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jmp .LBB0_4
; CHECK-NEXT: .LBB0_4: # %lor.end5
-; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movw %ax, %cx
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %ecx
+; CHECK-NEXT: movw %cx, %dx
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: addl $24, %esp
+; CHECK-NEXT: popl %esi
; CHECK-NEXT: retl
entry:
%aa = alloca i16, align 2
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
index cb26c13e53eb..e29b56236e26 100644
--- a/test/CodeGen/X86/pr32256.ll
+++ b/test/CodeGen/X86/pr32256.ll
@@ -7,39 +7,27 @@
define void @_Z1av() {
; CHECK-LABEL: _Z1av:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $6, %esp
+; CHECK-NEXT: subl $2, %esp
; CHECK-NEXT: .Lcfi0:
-; CHECK-NEXT: .cfi_def_cfa_offset 10
+; CHECK-NEXT: .cfi_def_cfa_offset 6
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: movb c, %cl
-; CHECK-NEXT: # implicit-def: %EAX
-; CHECK-NEXT: movb %cl, %al
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: kmovq %k1, %k2
-; CHECK-NEXT: kxnorw %k0, %k0, %k3
-; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kxorw %k3, %k1, %k1
-; CHECK-NEXT: kmovd %k1, %eax
; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: testb $1, %cl
-; CHECK-NEXT: kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill
-; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT: movb c, %dl
+; CHECK-NEXT: xorb $-1, %dl
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: jmp .LBB0_2
; CHECK-NEXT: .LBB0_1: # %land.rhs
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: kmovd %eax, %k0
-; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill
; CHECK-NEXT: jmp .LBB0_2
; CHECK-NEXT: .LBB0_2: # %land.end
-; CHECK-NEXT: kmovw (%esp), %k0 # 2-byte Reload
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp)
-; CHECK-NEXT: addl $6, %esp
+; CHECK-NEXT: movb (%esp), %al # 1-byte Reload
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
+; CHECK-NEXT: addl $2, %esp
; CHECK-NEXT: retl
entry:
%b = alloca i8, align 1
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 143e3af82eb7..571dd6774906 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -39,12 +39,6 @@ define void @foo() {
; X86-O0-NEXT: movzbl %al, %edx
; X86-O0-NEXT: subl %ecx, %edx
; X86-O0-NEXT: setle %al
-; X86-O0-NEXT: # implicit-def: %ECX
-; X86-O0-NEXT: movb %al, %cl
-; X86-O0-NEXT: andl $1, %ecx
-; X86-O0-NEXT: kmovd %ecx, %k0
-; X86-O0-NEXT: kmovd %k0, %ecx
-; X86-O0-NEXT: movb %cl, %al
; X86-O0-NEXT: andb $1, %al
; X86-O0-NEXT: movzbl %al, %ecx
; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -77,12 +71,6 @@ define void @foo() {
; X64-O0-NEXT: movzbl %al, %edx
; X64-O0-NEXT: subl %ecx, %edx
; X64-O0-NEXT: setle %al
-; X64-O0-NEXT: # implicit-def: %ECX
-; X64-O0-NEXT: movb %al, %cl
-; X64-O0-NEXT: andl $1, %ecx
-; X64-O0-NEXT: kmovd %ecx, %k0
-; X64-O0-NEXT: kmovd %k0, %ecx
-; X64-O0-NEXT: movb %cl, %al
; X64-O0-NEXT: andb $1, %al
; X64-O0-NEXT: movzbl %al, %ecx
; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
index d980b7ff284c..e4643a863f94 100644
--- a/test/CodeGen/X86/pr32451.ll
+++ b/test/CodeGen/X86/pr32451.ll
@@ -25,12 +25,6 @@ define i8** @japi1_convert_690(i8**, i8***, i32) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; CHECK-NEXT: movl 4(%ecx), %edx
; CHECK-NEXT: movb (%edx), %bl
-; CHECK-NEXT: # implicit-def: %EDX
-; CHECK-NEXT: movb %bl, %dl
-; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: kmovw %edx, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movb %dl, %bl
; CHECK-NEXT: andb $1, %bl
; CHECK-NEXT: movzbl %bl, %edx
; CHECK-NEXT: movl %edx, (%esp)
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 4be3a4c2391b..5d5150ad62d6 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB0_4:
-; 32-NEXT: orl %ebx, %edx
; 32-NEXT: orl %esi, %eax
+; 32-NEXT: orl %ebx, %edx
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB1_4:
-; 32-NEXT: orl %esi, %edx
; 32-NEXT: orl %ebx, %eax
+; 32-NEXT: orl %esi, %edx
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-LABEL: rotr1_64_mem:
; 32: # BB#0:
; 32-NEXT: pushl %esi
-; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 32-NEXT: movl 8(%esp), %eax
; 32-NEXT: movl (%eax), %ecx
; 32-NEXT: movl 4(%eax), %edx
; 32-NEXT: movl %edx, %esi
@@ -555,13 +555,11 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-NEXT: movl %ecx, 4(%eax)
; 32-NEXT: movl %esi, (%eax)
; 32-NEXT: popl %esi
-; 32-NEXT: retl
-;
+
; 64-LABEL: rotr1_64_mem:
; 64: # BB#0:
; 64-NEXT: rorq (%rdi)
; 64-NEXT: retq
-
%A = load i64, i64 *%Aptr
%B = shl i64 %A, 63
%C = lshr i64 %A, 1
@@ -573,7 +571,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
define void @rotr1_32_mem(i32* %Aptr) nounwind {
; 32-LABEL: rotr1_32_mem:
; 32: # BB#0:
-; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 32-NEXT: movl 4(%esp), %eax
; 32-NEXT: rorl (%eax)
; 32-NEXT: retl
;
@@ -592,7 +590,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind {
define void @rotr1_16_mem(i16* %Aptr) nounwind {
; 32-LABEL: rotr1_16_mem:
; 32: # BB#0:
-; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 32-NEXT: movl 4(%esp), %eax
; 32-NEXT: rorw (%eax)
; 32-NEXT: retl
;
@@ -611,7 +609,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind {
define void @rotr1_8_mem(i8* %Aptr) nounwind {
; 32-LABEL: rotr1_8_mem:
; 32: # BB#0:
-; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; 32-NEXT: movl 4(%esp), %eax
; 32-NEXT: rorb (%eax)
; 32-NEXT: retl
;
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index 7215c482ffa2..a8562677c7bf 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X64
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X64
declare i32 @llvm.x86.xbegin() nounwind
declare void @llvm.x86.xend() nounwind
@@ -13,7 +13,8 @@ define i32 @test_xbegin() nounwind uwtable {
; X86-NEXT: xbegin .LBB0_2
; X86-NEXT: # BB#1: # %entry
; X86-NEXT: movl $-1, %eax
-; X86-NEXT: .LBB0_2: # %entry
+; X86: .LBB0_2: # %entry
+; X86-NEXT: # XABORT DEF
; X86-NEXT: retl
;
; X64-LABEL: test_xbegin:
@@ -21,7 +22,8 @@ define i32 @test_xbegin() nounwind uwtable {
; X64-NEXT: xbegin .LBB0_2
; X64-NEXT: # BB#1: # %entry
; X64-NEXT: movl $-1, %eax
-; X64-NEXT: .LBB0_2: # %entry
+; X64: .LBB0_2: # %entry
+; X64-NEXT: # XABORT DEF
; X64-NEXT: retq
entry:
%0 = tail call i32 @llvm.x86.xbegin() nounwind
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 6a565a5c76f0..b8a8b8afd14f 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -149,131 +149,127 @@ middle.block:
define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; SSE2-NEXT: psubd %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
-; SSE2-NEXT: psubd %xmm10, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
-; SSE2-NEXT: movdqa %xmm11, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
+; SSE2-NEXT: psubd %xmm9, %xmm6
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE2-NEXT: psubd %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; SSE2-NEXT: psubd %xmm11, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm10
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT: psubd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
+; SSE2-NEXT: psubd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT: psubd %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: psubd %xmm2, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
+; SSE2-NEXT: psubd %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm2, %xmm10
; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
-; SSE2-NEXT: psubd %xmm6, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE2-NEXT: psubd %xmm6, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
-; SSE2-NEXT: psubd %xmm9, %xmm8
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm2, %xmm6
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm6, %xmm14
; SSE2-NEXT: paddd %xmm7, %xmm13
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm14
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: paddd %xmm10, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: paddd %xmm14, %xmm13
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm3
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: paddd %xmm12, %xmm0
+; SSE2-NEXT: paddd %xmm13, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -402,284 +398,288 @@ middle.block:
define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: subq $200, %rsp
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: subq $184, %rsp
; SSE2-NEXT: pxor %xmm15, %xmm15
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movaps a+1040(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
-; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm15, %xmm11
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm11, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm15, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm12, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm11
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
+; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm12
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm7, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm8
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm11
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm0, %xmm15
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm9
-; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm14
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm9, %xmm6
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm10, %xmm8
+; SSE2-NEXT: movdqa %xmm13, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm13, %xmm14
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm9, %xmm7
; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm0, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm7, %xmm12
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
-; SSE2-NEXT: movdqa %xmm13, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm7, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm3, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
-; SSE2-NEXT: movdqa %xmm13, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
-; SSE2-NEXT: psubd %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
-; SSE2-NEXT: psubd %xmm13, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: pxor %xmm1, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm11
-; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm15, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm15
-; SSE2-NEXT: pxor %xmm1, %xmm15
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm15
-; SSE2-NEXT: movdqa %xmm10, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm1, %xmm10
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm12
-; SSE2-NEXT: paddd %xmm12, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm2, %xmm4
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm10, %xmm12
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: psubd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm2, %xmm11
+; SSE2-NEXT: movdqa %xmm1, %xmm13
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm2, %xmm13
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm2, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm9, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT: psubd %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT: psubd %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm10
+; SSE2-NEXT: pxor %xmm0, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm13, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: pxor %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm11
+; SSE2-NEXT: pxor %xmm0, %xmm11
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: pxor %xmm0, %xmm12
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm14, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: pxor %xmm0, %xmm14
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm6
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm14, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm12, %xmm8
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm0, %xmm12
+; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm13, %xmm7
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm3, %xmm8
-; SSE2-NEXT: paddd %xmm2, %xmm15
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm8, %xmm13
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm11, %xmm10
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: paddd %xmm13, %xmm1
-; SSE2-NEXT: paddd %xmm15, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm6
+; SSE2-NEXT: movdqa %xmm12, %xmm2
+; SSE2-NEXT: paddd %xmm11, %xmm2
+; SSE2-NEXT: paddd %xmm13, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm7
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm3, %xmm7
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm14, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: paddd %xmm8, %xmm7
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: paddd %xmm2, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
+; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: addq $200, %rsp
+; SSE2-NEXT: addq $184, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
@@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
@@ -697,6 +697,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -704,49 +705,48 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
-; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
-; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpabsd %ymm9, %ymm8
-; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
-; AVX2-NEXT: vpabsd %ymm10, %ymm8
-; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
-; AVX2-NEXT: vpabsd %ymm11, %ymm8
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
+; AVX2-NEXT: vpabsd %ymm8, %ymm8
; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
-; AVX2-NEXT: vpabsd %ymm12, %ymm8
-; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpabsd %ymm13, %ymm8
-; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpabsd %ymm14, %ymm8
; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpabsd %ymm15, %ymm8
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm12, %ymm8
+; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm15, %ymm8
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
+; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
+; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7
; AVX512F-NEXT: vpabsd %zmm4, %zmm4
+; AVX512F-NEXT: vpabsd %zmm5, %zmm5
+; AVX512F-NEXT: vpabsd %zmm6, %zmm6
+; AVX512F-NEXT: vpabsd %zmm7, %zmm7
+; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
+; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
+; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
-; AVX512F-NEXT: vpabsd %zmm5, %zmm4
-; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1
-; AVX512F-NEXT: vpabsd %zmm6, %zmm4
-; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2
-; AVX512F-NEXT: vpabsd %zmm7, %zmm4
-; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: # BB#2: # %middle.block
@@ -1154,54 +1154,59 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-LABEL: sad_nonloop_32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm12, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm9, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
-; SSE2-NEXT: movdqa %xmm12, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movdqu (%rdx), %xmm7
-; SSE2-NEXT: movdqu 16(%rdx), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: psubd %xmm5, %xmm10
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: psubd %xmm5, %xmm11
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: psubd %xmm5, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT: psubd %xmm5, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
-; SSE2-NEXT: psubd %xmm6, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: psubd %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-NEXT: psubd %xmm3, %xmm12
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; SSE2-NEXT: psubd %xmm7, %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm12
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqu (%rdx), %xmm5
+; SSE2-NEXT: movdqu 16(%rdx), %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: psubd %xmm7, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm13
+; SSE2-NEXT: psubd %xmm1, %xmm12
+; SSE2-NEXT: psubd %xmm8, %xmm6
+; SSE2-NEXT: psubd %xmm15, %xmm11
+; SSE2-NEXT: psubd %xmm14, %xmm10
+; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm1, %xmm9
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
@@ -1210,37 +1215,33 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm13
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm10, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm8
-; SSE2-NEXT: pxor %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: movdqa %xmm13, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm13
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: paddd %xmm9, %xmm6
+; SSE2-NEXT: paddd %xmm10, %xmm6
; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: paddd %xmm6, %xmm0
+; SSE2-NEXT: paddd %xmm13, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 1afef86a5f11..ce42d0d643e8 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -299,21 +299,20 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: testb %dil, %dil
; GENERIC-NEXT: jne LBB7_4
; GENERIC-NEXT: ## BB#5:
-; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: jmp LBB7_6
; GENERIC-NEXT: LBB7_4:
-; GENERIC-NEXT: movd %r9d, %xmm1
-; GENERIC-NEXT: movd %ecx, %xmm2
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; GENERIC-NEXT: movd %r8d, %xmm3
+; GENERIC-NEXT: movd %r9d, %xmm2
+; GENERIC-NEXT: movd %ecx, %xmm3
+; GENERIC-NEXT: movd %r8d, %xmm4
; GENERIC-NEXT: movd %edx, %xmm1
; GENERIC-NEXT: LBB7_6:
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
@@ -340,19 +339,16 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; ATOM-NEXT: jmp LBB7_6
; ATOM-NEXT: LBB7_4:
-; ATOM-NEXT: movd %r9d, %xmm1
-; ATOM-NEXT: movd %ecx, %xmm2
-; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; ATOM-NEXT: movd %r8d, %xmm3
+; ATOM-NEXT: movd %r9d, %xmm2
+; ATOM-NEXT: movd %ecx, %xmm3
+; ATOM-NEXT: movd %r8d, %xmm4
; ATOM-NEXT: movd %edx, %xmm1
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; ATOM-NEXT: LBB7_6:
+; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
; ATOM-NEXT: movq %xmm0, 16(%rsi)
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 332bf2887fb0..2996edaec3e0 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rax
+; SSE2-NEXT: movq %xmm4, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: movq %xmm4, %r9
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rcx, %rsi
-; SSE2-NEXT: orq %rdi, %rsi
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorq %rdx, %rax
-; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %r8, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movq %xmm3, %rdx
+; SSE2-NEXT: xorq %rsi, %rdx
+; SSE2-NEXT: xorq %r10, %rcx
+; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: xorq %r9, %rax
+; SSE2-NEXT: xorq %r8, %rdi
+; SSE2-NEXT: orq %rax, %rdi
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %rcx, %rdi
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rax
+; SSE2-NEXT: movq %xmm4, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: movq %xmm4, %r9
+; SSE2-NEXT: movq %xmm0, %r10
+; SSE2-NEXT: movq %xmm1, %rsi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: xorq %rcx, %rsi
-; SSE2-NEXT: orq %rdi, %rsi
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorq %rdx, %rax
-; SSE2-NEXT: movq %xmm3, %rcx
-; SSE2-NEXT: xorq %r8, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movq %xmm3, %rdx
+; SSE2-NEXT: xorq %rsi, %rdx
+; SSE2-NEXT: xorq %r10, %rcx
+; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: xorq %r9, %rax
+; SSE2-NEXT: xorq %r8, %rdi
+; SSE2-NEXT: orq %rax, %rdi
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rsi, %rcx
+; SSE2-NEXT: orq %rcx, %rdi
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll
index 6701c247e6fc..c869dff9e642 100644
--- a/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi
; CHECK-NEXT: movzbl (%edx,%ecx), %edx
; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx
-; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movzbl (%eax,%ecx), %eax
; CHECK-NEXT: imull %edx, %eax
+; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4)
; CHECK-NEXT: movl %eax, (%esi,%ecx,4)
; CHECK-NEXT: popl %esi
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index d99cfaf535de..0b03dffe99b5 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
@@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind {
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
; X32: # BB#0:
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_ps:
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index f711dc615742..4b2af6fce8de 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1119,9 +1119,9 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
;
; AVX512-LABEL: add_ss_mask:
; AVX512: # BB#0:
-; AVX512-NEXT: andl $1, %edi
+; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%1 = extractelement <4 x float> %a, i64 0
@@ -1174,9 +1174,9 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
;
; AVX512-LABEL: add_sd_mask:
; AVX512: # BB#0:
-; AVX512-NEXT: andl $1, %edi
+; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
; AVX512-NEXT: vmovapd %xmm2, %xmm0
; AVX512-NEXT: retq
%1 = extractelement <2 x double> %a, i64 0
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 68ab3f9f3205..f4964b5a6f66 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -66,8 +66,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: jne .LBB1_8
; X32-NEXT: .LBB1_7:
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: je .LBB1_10
; X32-NEXT: jmp .LBB1_11
; X32-NEXT: .LBB1_1:
@@ -80,8 +80,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: je .LBB1_7
; X32-NEXT: .LBB1_8: # %entry
; X32-NEXT: xorps %xmm3, %xmm3
-; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-NEXT: jne .LBB1_11
; X32-NEXT: .LBB1_10:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -105,8 +105,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: jne .LBB1_8
; X64-NEXT: .LBB1_7:
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: testl %esi, %esi
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: je .LBB1_10
; X64-NEXT: jmp .LBB1_11
; X64-NEXT: .LBB1_1:
@@ -119,8 +119,8 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: je .LBB1_7
; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
-; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: testl %esi, %esi
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: jne .LBB1_11
; X64-NEXT: .LBB1_10:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index aed5e0d1c32e..4d895ea264c5 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: subss %xmm4, %xmm3
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: addss %xmm0, %xmm3
+; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm0, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -431,12 +431,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 4a0dc9c1eb17..503b9416c8d3 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32: ## BB#0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X32-NEXT: addss %xmm2, %xmm3
; X32-NEXT: addss %xmm1, %xmm0
+; X32-NEXT: addss %xmm2, %xmm3
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT: retl
;
@@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X64: ## BB#0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-NEXT: addss %xmm2, %xmm3
; X64-NEXT: addss %xmm1, %xmm0
+; X64-NEXT: addss %xmm2, %xmm3
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT: retq
entry:
@@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm3
; X32-NEXT: addps %xmm3, %xmm0
; X32-NEXT: retl
@@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm3
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/subcarry.ll b/test/CodeGen/X86/subcarry.ll
new file mode 100644
index 000000000000..df676328f682
--- /dev/null
+++ b/test/CodeGen/X86/subcarry.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+
+%S = type { [4 x i64] }
+
+define %S @negate(%S* nocapture readonly %this) {
+; CHECK-LABEL: negate:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq (%rsi), %rax
+; CHECK-NEXT: movq 8(%rsi), %rcx
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: addq $1, %rax
+; CHECK-NEXT: notq %rcx
+; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: movq 16(%rsi), %rdx
+; CHECK-NEXT: notq %rdx
+; CHECK-NEXT: adcq $0, %rdx
+; CHECK-NEXT: movq 24(%rsi), %rsi
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: adcq $0, %rsi
+; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq %rdx, 16(%rdi)
+; CHECK-NEXT: movq %rsi, 24(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 0
+ %1 = load i64, i64* %0, align 8
+ %2 = xor i64 %1, -1
+ %3 = zext i64 %2 to i128
+ %4 = add nuw nsw i128 %3, 1
+ %5 = trunc i128 %4 to i64
+ %6 = lshr i128 %4, 64
+ %7 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 1
+ %8 = load i64, i64* %7, align 8
+ %9 = xor i64 %8, -1
+ %10 = zext i64 %9 to i128
+ %11 = add nuw nsw i128 %6, %10
+ %12 = trunc i128 %11 to i64
+ %13 = lshr i128 %11, 64
+ %14 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 2
+ %15 = load i64, i64* %14, align 8
+ %16 = xor i64 %15, -1
+ %17 = zext i64 %16 to i128
+ %18 = add nuw nsw i128 %13, %17
+ %19 = lshr i128 %18, 64
+ %20 = trunc i128 %18 to i64
+ %21 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 3
+ %22 = load i64, i64* %21, align 8
+ %23 = xor i64 %22, -1
+ %24 = zext i64 %23 to i128
+ %25 = add nuw nsw i128 %19, %24
+ %26 = trunc i128 %25 to i64
+ %27 = insertvalue [4 x i64] undef, i64 %5, 0
+ %28 = insertvalue [4 x i64] %27, i64 %12, 1
+ %29 = insertvalue [4 x i64] %28, i64 %20, 2
+ %30 = insertvalue [4 x i64] %29, i64 %26, 3
+ %31 = insertvalue %S undef, [4 x i64] %30, 0
+ ret %S %31
+}
+
+define %S @sub(%S* nocapture readonly %this, %S %arg.b) local_unnamed_addr {
+; CHECK-LABEL: sub:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: notq %rdx
+; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: addq (%rsi), %rdx
+; CHECK-NEXT: setb %r10b
+; CHECK-NEXT: addq $1, %rdx
+; CHECK-NEXT: adcq 8(%rsi), %r10
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movzbl %al, %r11d
+; CHECK-NEXT: notq %rcx
+; CHECK-NEXT: addq %r10, %rcx
+; CHECK-NEXT: adcq 16(%rsi), %r11
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: notq %r8
+; CHECK-NEXT: addq %r11, %r8
+; CHECK-NEXT: adcq 24(%rsi), %rax
+; CHECK-NEXT: notq %r9
+; CHECK-NEXT: addq %rax, %r9
+; CHECK-NEXT: movq %rdx, (%rdi)
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq %r8, 16(%rdi)
+; CHECK-NEXT: movq %r9, 24(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = extractvalue %S %arg.b, 0
+ %.elt6 = extractvalue [4 x i64] %0, 1
+ %.elt8 = extractvalue [4 x i64] %0, 2
+ %.elt10 = extractvalue [4 x i64] %0, 3
+ %.elt = extractvalue [4 x i64] %0, 0
+ %1 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 0
+ %2 = load i64, i64* %1, align 8
+ %3 = zext i64 %2 to i128
+ %4 = add nuw nsw i128 %3, 1
+ %5 = xor i64 %.elt, -1
+ %6 = zext i64 %5 to i128
+ %7 = add nuw nsw i128 %4, %6
+ %8 = trunc i128 %7 to i64
+ %9 = lshr i128 %7, 64
+ %10 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 1
+ %11 = load i64, i64* %10, align 8
+ %12 = zext i64 %11 to i128
+ %13 = add nuw nsw i128 %9, %12
+ %14 = xor i64 %.elt6, -1
+ %15 = zext i64 %14 to i128
+ %16 = add nuw nsw i128 %13, %15
+ %17 = trunc i128 %16 to i64
+ %18 = lshr i128 %16, 64
+ %19 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 2
+ %20 = load i64, i64* %19, align 8
+ %21 = zext i64 %20 to i128
+ %22 = add nuw nsw i128 %18, %21
+ %23 = xor i64 %.elt8, -1
+ %24 = zext i64 %23 to i128
+ %25 = add nuw nsw i128 %22, %24
+ %26 = lshr i128 %25, 64
+ %27 = trunc i128 %25 to i64
+ %28 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 3
+ %29 = load i64, i64* %28, align 8
+ %30 = zext i64 %29 to i128
+ %31 = add nuw nsw i128 %26, %30
+ %32 = xor i64 %.elt10, -1
+ %33 = zext i64 %32 to i128
+ %34 = add nuw nsw i128 %31, %33
+ %35 = trunc i128 %34 to i64
+ %36 = insertvalue [4 x i64] undef, i64 %8, 0
+ %37 = insertvalue [4 x i64] %36, i64 %17, 1
+ %38 = insertvalue [4 x i64] %37, i64 %27, 2
+ %39 = insertvalue [4 x i64] %38, i64 %35, 3
+ %40 = insertvalue %S undef, [4 x i64] %39, 0
+ ret %S %40
+}
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 1eef67764ab9..a42b3c96c3ae 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
; AVX1-NEXT: # BB#5:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
; AVX1-NEXT: movq %rax, %rcx
@@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
; AVX1-NEXT: # BB#8:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB80_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB80_15:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
; AVX1-NEXT: # BB#17:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_18:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
; AVX1-NEXT: # BB#20:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
; AVX1-NEXT: movq %rax, %rcx
@@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB80_21:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vpextrq $1, %xmm3, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
; AVX1-NEXT: # BB#23:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB80_24:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
; AVX2-NEXT: # BB#5:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
; AVX2-NEXT: movq %rax, %rcx
@@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
; AVX2-NEXT: # BB#8:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB80_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB80_15:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_16
; AVX2-NEXT: # BB#17:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX2-NEXT: jmp .LBB80_18
; AVX2-NEXT: .LBB80_16:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
-; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_18:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_19
; AVX2-NEXT: # BB#20:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX2-NEXT: jmp .LBB80_21
; AVX2-NEXT: .LBB80_19:
; AVX2-NEXT: movq %rax, %rcx
@@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT: .LBB80_21:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm3, %rax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_22
; AVX2-NEXT: # BB#23:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX2-NEXT: jmp .LBB80_24
; AVX2-NEXT: .LBB80_22:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
-; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB80_24:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index 2fb821555dba..226c0adbaf3c 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index f0a5fe1dbfff..a05a981daa1f 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_logic_v8i32:
@@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm3
; SSSE3-NEXT: pandn %xmm5, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_logic_v8i32:
diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll
index 8081e9482d67..c5ac4466b5fa 100644
--- a/test/CodeGen/X86/vector-sqrt.ll
+++ b/test/CodeGen/X86/vector-sqrt.ll
@@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1
+; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2
+; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm1
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; CHECK-NEXT: vsqrtss 12(%rdi), %xmm2, %xmm1
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT: retq
entry:
%0 = load float, float* %v, align 4
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 450e255313b3..6fbec91e77a3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
+; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index aed305058f0b..03f284d87a66 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -712,17 +712,11 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
;
; KNL-LABEL: bug27873:
; KNL: ## BB#0:
-; KNL-NEXT: andl $1, %esi
; KNL-NEXT: movl $160, %ecx
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rcx
-; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: seto %al
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: orb %sil, %al
; KNL-NEXT: retq
%mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
%mul.overflow = extractvalue { i64, i1 } %mul, 1
diff --git a/test/CodeGen/X86/xor-select-i1-combine.ll b/test/CodeGen/X86/xor-select-i1-combine.ll
index 6507ddcc7697..c9383282a0cc 100644
--- a/test/CodeGen/X86/xor-select-i1-combine.ll
+++ b/test/CodeGen/X86/xor-select-i1-combine.ll
@@ -7,10 +7,10 @@
define i32 @main(i8 %small) {
; CHECK-LABEL: main:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $n, %eax
-; CHECK-NEXT: movl $m, %ecx
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: cmovneq %rax, %rcx
+; CHECK-NEXT: movl $m, %eax
+; CHECK-NEXT: movl $n, %ecx
+; CHECK-NEXT: cmoveq %rax, %rcx
; CHECK-NEXT: movl (%rcx), %eax
; CHECK-NEXT: retq
entry:
diff --git a/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.dwo b/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.dwo
new file mode 100644
index 000000000000..2a3bc57caa6d
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.dwo
Binary files differ
diff --git a/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.o b/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.o
new file mode 100644
index 000000000000..b6993c6cae20
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-addr-object-relocation.o
Binary files differ
diff --git a/test/DebugInfo/Inputs/split-dwarf-multiple-cu.dwo b/test/DebugInfo/Inputs/split-dwarf-multiple-cu.dwo
new file mode 100644
index 000000000000..4df9894b089a
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-multiple-cu.dwo
Binary files differ
diff --git a/test/DebugInfo/Inputs/split-dwarf-multiple-cu.o b/test/DebugInfo/Inputs/split-dwarf-multiple-cu.o
new file mode 100644
index 000000000000..aa4ab4bc76f7
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-multiple-cu.o
Binary files differ
diff --git a/test/DebugInfo/PDB/Inputs/merge1.yaml b/test/DebugInfo/PDB/Inputs/merge1.yaml
new file mode 100644
index 000000000000..89d471e3343d
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge1.yaml
@@ -0,0 +1,52 @@
+---
+TpiStream:
+ Records:
+ # uint32_t* [Index: 0x1000]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 117
+ Attrs: 32778
+ # int64_t* [Index: 0x1001]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 118
+ Attrs: 32778
+ # struct OnlyInMerge1 [Index: 0x1002]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 0
+ Options: [ None, ForwardReference, HasUniqueName ]
+ FieldList: 0
+ Name: 'OnlyInMerge1'
+ UniqueName: 'OnlyInMerge1'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 0
+ # uint32_t** [Index: 0x1003]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4096
+ Attrs: 32778
+ # uint32_t*** [Index: 0x1004]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4099
+ Attrs: 32778
+  # int64_t** [Index: 0x1005]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4097
+ Attrs: 32778
+ # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1006]
+ - Kind: LF_ARGLIST
+ ArgList:
+ ArgIndices: [ 117, 4096, 4099 ]
+ # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1007]
+ - Kind: LF_PROCEDURE
+ Procedure:
+ ReturnType: 117
+ CallConv: NearC
+ Options: [ None ]
+ ParameterCount: 0
+ ArgumentList: 4102
+...
diff --git a/test/DebugInfo/PDB/Inputs/merge2.yaml b/test/DebugInfo/PDB/Inputs/merge2.yaml
new file mode 100644
index 000000000000..b6cbdb98f0ca
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge2.yaml
@@ -0,0 +1,52 @@
+---
+TpiStream:
+ Records:
+ # uint32_t* [Index: 0x1000]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 117
+ Attrs: 32778
+ # uint32_t** [Index: 0x1001]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4096
+ Attrs: 32778
+ # uint32_t*** [Index: 0x1002]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4097
+ Attrs: 32778
+ # [uint32_t, uint32_t*, uint32_t**] [Index: 0x1003]
+ - Kind: LF_ARGLIST
+ ArgList:
+ ArgIndices: [ 117, 4096, 4097 ]
+ # uint32_t (uint32_t, uint32_t*, uint32_t**) [Index: 0x1004]
+ - Kind: LF_PROCEDURE
+ Procedure:
+ ReturnType: 117
+ CallConv: NearC
+ Options: [ None ]
+ ParameterCount: 0
+ ArgumentList: 4099
+ # int64_t* [Index: 0x1005]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 118
+ Attrs: 32778
+ # int64_t** [Index: 0x1006]
+ - Kind: LF_POINTER
+ Pointer:
+ ReferentType: 4101
+ Attrs: 32778
+ # struct OnlyInMerge2 [Index: 0x1007]
+ - Kind: LF_STRUCTURE
+ Class:
+ MemberCount: 0
+ Options: [ None, ForwardReference, HasUniqueName ]
+ FieldList: 0
+ Name: 'OnlyInMerge2'
+ UniqueName: 'OnlyInMerge2'
+ DerivationList: 0
+ VTableShape: 0
+ Size: 0
+...
diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test
index d67743efd707..4e6bb75f8b8d 100644
--- a/test/DebugInfo/PDB/pdbdump-headers.test
+++ b/test/DebugInfo/PDB/pdbdump-headers.test
@@ -326,7 +326,7 @@
; EMPTY-NEXT: TypeLeafKind: LF_SUBSTR_LIST (0x1604)
; EMPTY-NEXT: NumStrings: 1
; EMPTY-NEXT: Strings [
-; EMPTY-NEXT: String: __vc_attributes::threadingAttribute (0x100B)
+; EMPTY-NEXT: String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows
; EMPTY-NEXT: ]
; EMPTY-NEXT: }
; EMPTY-NEXT: Bytes (
@@ -1253,7 +1253,7 @@
; ALL: TypeLeafKind: LF_SUBSTR_LIST (0x1604)
; ALL: NumStrings: 1
; ALL: Strings [
-; ALL: String: __vc_attributes::threadingAttribute (0x100B)
+; ALL: String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows (0x100B)
; ALL: ]
; ALL: }
; ALL: }
diff --git a/test/DebugInfo/PDB/pdbdump-mergetypes.test b/test/DebugInfo/PDB/pdbdump-mergetypes.test
new file mode 100644
index 000000000000..96f6316d4766
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-mergetypes.test
@@ -0,0 +1,24 @@
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.1.pdb %p/Inputs/merge1.yaml
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2.pdb %p/Inputs/merge2.yaml
+; RUN: llvm-pdbdump merge -pdb=%t.3.pdb %t.1.pdb %t.2.pdb
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=MERGED %s
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=ARGLIST %s
+
+
+MERGED: Type Info Stream (TPI)
+MERGED: Record count: 9
+MERGED-DAG: PointeeType: unsigned
+MERGED-DAG: PointeeType: unsigned*
+MERGED-DAG: PointeeType: unsigned**
+MERGED-DAG: PointeeType: __int64
+MERGED-DAG: PointeeType: __int64*
+MERGED-DAG: Name: OnlyInMerge1
+MERGED-DAG: Name: OnlyInMerge2
+MERGED-DAG: TypeLeafKind: LF_ARGLIST
+
+ARGLIST: TypeLeafKind: LF_ARGLIST
+ARGLIST-NEXT: NumArgs: 3
+ARGLIST-NEXT: Arguments [
+ARGLIST-NEXT: ArgType: unsigned
+ARGLIST-NEXT: ArgType: unsigned*
+ARGLIST-NEXT: ArgType: unsigned**
diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index 7ea062e6c9e7..f0db8f4b921f 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test
@@ -23,6 +23,10 @@ RUN: cp %p/Inputs/split-dwarf-test.dwo %T
RUN: echo "%p/Inputs/split-dwarf-test 0x4005d4" >> %t.input
RUN: echo "%p/Inputs/split-dwarf-test 0x4005c4" >> %t.input
RUN: echo "%p/Inputs/cross-cu-inlining.x86_64-macho.o 0x17" >> %t.input
+RUN: cp %p/Inputs/split-dwarf-multiple-cu.dwo %T
+RUN: echo "%p/Inputs/split-dwarf-multiple-cu.o 0x4" >> %t.input
+RUN: cp %p/Inputs/split-dwarf-addr-object-relocation.dwo %T
+RUN: echo "%p/Inputs/split-dwarf-addr-object-relocation.o 0x14" >> %t.input
RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
RUN: --default-arch=i386 < %t.input | FileCheck --check-prefix=CHECK --check-prefix=SPLIT --check-prefix=DWO %s
@@ -133,6 +137,16 @@ CHECK-NEXT: /tmp{{[/\\]}}cross-cu-inlining.c:16:3
CHECK-NEXT: main
CHECK-NEXT: /tmp{{[/\\]}}cross-cu-inlining.c:11:0
+CHECK: f2
+CHECK-NEXT: b.cpp:3:3
+CHECK-NEXT: f3
+CHECK-NEXT: b.cpp:6:0
+
+CHECK: f2
+CHECK-NEXT: split-dwarf-addr-object-relocation.cpp:3:3
+CHECK-NEXT: f3
+CHECK-NEXT: split-dwarf-addr-object-relocation.cpp:6:0
+
RUN: echo "unexisting-file 0x1234" > %t.input2
RUN: llvm-symbolizer < %t.input2 2>&1 | FileCheck %s --check-prefix=MISSING-FILE
diff --git a/test/Instrumentation/MemorySanitizer/csr.ll b/test/Instrumentation/MemorySanitizer/csr.ll
index c4e3a3f73920..c288f93241b9 100644
--- a/test/Instrumentation/MemorySanitizer/csr.ll
+++ b/test/Instrumentation/MemorySanitizer/csr.ll
@@ -1,5 +1,6 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
; RUN: opt < %s -msan -msan-check-access-address=1 -S | FileCheck %s --check-prefix=ADDR
+; REQUIRES: x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/LTO/X86/Inputs/strip-debug-info-bar.ll b/test/LTO/X86/Inputs/strip-debug-info-bar.ll
new file mode 100644
index 000000000000..4269886676b3
--- /dev/null
+++ b/test/LTO/X86/Inputs/strip-debug-info-bar.ll
@@ -0,0 +1,15 @@
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12"
+
+define void @bar() !dbg !3 {
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2)
+!2 = !DIFile(filename: "broken", directory: "")
+!3 = distinct !DISubprogram(line: 1000, isDefinition: true)
+
diff --git a/test/LTO/X86/Inputs/strip-debug-info.bc b/test/LTO/X86/Inputs/strip-debug-info.bc
deleted file mode 100644
index c83195ff9caf..000000000000
--- a/test/LTO/X86/Inputs/strip-debug-info.bc
+++ /dev/null
Binary files differ
diff --git a/test/LTO/X86/strip-debug-info.ll b/test/LTO/X86/strip-debug-info.ll
index ff45ca15243e..6b7745164446 100644
--- a/test/LTO/X86/strip-debug-info.ll
+++ b/test/LTO/X86/strip-debug-info.ll
@@ -1,16 +1,61 @@
+; RUN: llvm-as -disable-verify %s -o %t.bc
+; ---- Full LTO ---------------------------------------------
; RUN: not llvm-lto -lto-strip-invalid-debug-info=false \
-; RUN: -o %t.o %S/Inputs/strip-debug-info.bc 2>&1 | \
+; RUN: -o %t.o %t.bc 2>&1 | \
; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-ERR
; RUN: llvm-lto -lto-strip-invalid-debug-info=true \
; RUN: -exported-symbol foo -exported-symbol _foo \
-; RUN: -o %t.o %S/Inputs/strip-debug-info.bc 2>&1 | \
+; RUN: -o %t.o %t.bc 2>&1 | \
; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-WARN
; RUN: llvm-nm %t.o | FileCheck %s
+; ---- Thin LTO (codegen only) ------------------------------
+; RUN: not llvm-lto -thinlto -thinlto-action=codegen \
+; RUN: -lto-strip-invalid-debug-info=false \
+; RUN: %t.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-ERR
+; RUN: llvm-lto -thinlto -thinlto-action=codegen \
+; RUN: -lto-strip-invalid-debug-info=true \
+; RUN: %t.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-WARN
+; ---- Thin LTO (optimize, strip main file) -----------------
+; RUN: opt -disable-verify -module-summary %s -o %t.bc
+; RUN: opt -disable-verify -module-summary %S/Inputs/strip-debug-info-bar.ll \
+; RUN: -o %t2.bc
+; RUN: not llvm-lto -thinlto -thinlto-action=run \
+; RUN: -lto-strip-invalid-debug-info=false \
+; RUN: %t.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-ERR
+; RUN: llvm-lto -thinlto -thinlto-action=run \
+; RUN: -lto-strip-invalid-debug-info=true \
+; RUN: %t.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-WARN
+; ---- Thin LTO (optimize, strip imported file) -------------
+; RUN: opt -disable-verify -strip-debug -module-summary %t.bc -o %t-stripped.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t.index.bc %t-stripped.bc %t2.bc
+; RUN: not llvm-lto -thinlto -thinlto-action=import \
+; RUN: -thinlto-index=%t.index.bc \
+; RUN: -lto-strip-invalid-debug-info=false \
+; RUN: -exported-symbol foo -exported-symbol _foo \
+; RUN: %t-stripped.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-ERR
+; RUN: llvm-lto -thinlto -thinlto-action=import \
+; RUN: -lto-strip-invalid-debug-info=true \
+; RUN: -thinlto-index=%t.index.bc \
+; RUN: -exported-symbol foo -exported-symbol _foo \
+; RUN: %t-stripped.bc -disable-verify 2>&1 | \
+; RUN: FileCheck %s -allow-empty -check-prefix=CHECK-WARN
; CHECK-ERR: Broken module found, compilation aborted
; CHECK-WARN: Invalid debug info found, debug info will be stripped
+; CHECK-WARN-NOT: Broken module found
; CHECK: foo
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12"
+
+declare void @bar()
+
define void @foo() {
+ call void @bar()
ret void
}
diff --git a/test/MC/AMDGPU/exp.s b/test/MC/AMDGPU/exp.s
index 710a777ab217..fab89e48dcfa 100644
--- a/test/MC/AMDGPU/exp.s
+++ b/test/MC/AMDGPU/exp.s
@@ -112,3 +112,15 @@ exp mrt0 v4, v3, v2, v1 vm
exp mrt0 v4, v3, v2, v1 done vm
// SI: exp mrt0 v4, v3, v2, v1 done vm ; encoding: [0x0f,0x18,0x00,0xf8,0x04,0x03,0x02,0x01]
// VI: exp mrt0 v4, v3, v2, v1 done vm ; encoding: [0x0f,0x18,0x00,0xc4,0x04,0x03,0x02,0x01]
+
+exp mrtz, v3, v3, v7, v7 compr
+// SI: exp mrtz v3, v3, v7, v7 compr ; encoding: [0x8f,0x04,0x00,0xf8,0x03,0x07,0x00,0x00]
+// VI: exp mrtz v3, v3, v7, v7 compr ; encoding: [0x8f,0x04,0x00,0xc4,0x03,0x07,0x00,0x00]
+
+exp mrtz, off, off, v7, v7 compr
+// SI: exp mrtz off, off, v7, v7 compr ; encoding: [0x8c,0x04,0x00,0xf8,0x00,0x07,0x00,0x00]
+// VI: exp mrtz off, off, v7, v7 compr ; encoding: [0x8c,0x04,0x00,0xc4,0x00,0x07,0x00,0x00]
+
+exp mrtz, v3, v3, off, off compr
+// SI: exp mrtz v3, v3, off, off compr ; encoding: [0x83,0x04,0x00,0xf8,0x03,0x00,0x00,0x00]
+// VI: exp mrtz v3, v3, off, off compr ; encoding: [0x83,0x04,0x00,0xc4,0x03,0x00,0x00,0x00]
diff --git a/test/MC/Disassembler/AMDGPU/exp_vi.txt b/test/MC/Disassembler/AMDGPU/exp_vi.txt
new file mode 100644
index 000000000000..9291fb807839
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/exp_vi.txt
@@ -0,0 +1,40 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI
+
+# VI: exp mrt0 v1, v2, v3, v4 ; encoding: [0x0f,0x00,0x00,0xc4,0x01,0x02,0x03,0x04]
+0x0f,0x00,0x00,0xc4,0x01,0x02,0x03,0x04
+
+# VI: exp mrt0 v1, v2, v3, v4 vm ; encoding: [0x0f,0x10,0x00,0xc4,0x01,0x02,0x03,0x04]
+0x0f,0x10,0x00,0xc4,0x01,0x02,0x03,0x04
+
+# VI: exp mrt0 v1, v1, v3, v3 compr ; encoding: [0x0f,0x04,0x00,0xc4,0x01,0x03,0x00,0x00]
+0x0f,0x04,0x00,0xc4,0x01,0x03,0x00,0x00
+
+# VI: exp mrt0 v1, v2, v3, v4 done ; encoding: [0x0f,0x08,0x00,0xc4,0x01,0x02,0x03,0x04]
+0x0f,0x08,0x00,0xc4,0x01,0x02,0x03,0x04
+
+# VI: exp mrt0 v2, v2, v4, v4 done compr vm ; encoding: [0x0f,0x1c,0x00,0xc4,0x02,0x04,0x00,0x00]
+0x0f,0x1c,0x00,0xc4,0x02,0x04,0x00,0x00
+
+# VI: exp mrt0 v7, off, off, off vm ; encoding: [0x01,0x10,0x00,0xc4,0x07,0x00,0x00,0x00]
+0x01,0x10,0x00,0xc4,0x07,0x00,0x00,0x00
+
+# VI: exp mrt0 off, off, v1, v2 ; encoding: [0x0c,0x00,0x00,0xc4,0x00,0x00,0x01,0x02]
+0x0c,0x00,0x00,0xc4,0x00,0x00,0x01,0x02
+
+# VI: exp mrt0 off, off, v8, v8 done compr ; encoding: [0x0c,0x0c,0x00,0xc4,0x00,0x08,0x00,0x00]
+0x0c,0x0c,0x00,0xc4,0x00,0x08,0x00,0x00
+
+# VI: exp mrt0 v1, v1, off, off compr ; encoding: [0x03,0x04,0x00,0xc4,0x01,0x00,0x00,0x00]
+0x03,0x04,0x00,0xc4,0x01,0x00,0x00,0x00
+
+# VI: exp param0 off, off, off, off compr ; encoding: [0x00,0x06,0x00,0xc4,0x00,0x00,0x00,0x00]
+0x00,0x06,0x00,0xc4,0x00,0x00,0x00,0x00
+
+# VI: exp mrtz v0, off, off, off done vm ; encoding: [0x81,0x18,0x00,0xc4,0x00,0x00,0x00,0x00]
+0x81,0x18,0x00,0xc4,0x00,0x00,0x00,0x00
+
+# VI: exp null v255, v0, v255, v0 ; encoding: [0x9f,0x00,0x00,0xc4,0xff,0x00,0xff,0x00]
+0x9f,0x00,0x00,0xc4,0xff,0x00,0xff,0x00
+
+# VI: exp pos0 v1, off, off, off ; encoding: [0xc1,0x00,0x00,0xc4,0x01,0x00,0x00,0x00]
+0xc1,0x00,0x00,0xc4,0x01,0x00,0x00,0x00
diff --git a/test/MC/Disassembler/AMDGPU/sopc_vi.txt b/test/MC/Disassembler/AMDGPU/sopc_vi.txt
index 026dcbafed42..2c2dc07efd65 100644
--- a/test/MC/Disassembler/AMDGPU/sopc_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/sopc_vi.txt
@@ -50,3 +50,6 @@
# GCN: s_setvskip s3, s5 ; encoding: [0x03,0x05,0x10,0xbf]
0x03 0x05 0x10 0xbf
+
+# GCN: s_bitcmp0_b32 0xafaaffff, 0xafaaffff ; encoding: [0xff,0xff,0x0c,0xbf,0xff,0xff,0xaa,0xaf]
+0xff 0xff 0x0c 0xbf 0xff 0xff 0xaa 0xaf
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index 2784e937954a..aeac85962f63 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -7,10 +7,6 @@ include "llvm/Target/Target.td"
def MyTargetISA : InstrInfo;
def MyTarget : Target { let InstructionSet = MyTargetISA; }
-let TargetPrefix = "mytarget" in {
-def int_mytarget_nop : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-}
-
def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
def GPR32Op : RegisterOperand<GPR32>;
@@ -131,37 +127,6 @@ def : Pat<(select GPR32:$src1, complex:$src2, complex:$src3),
def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
[(set GPR32:$dst, (add GPR32:$src1, GPR32:$src2))]>;
-//===- Test a simple pattern with an intrinsic. ---------------------------===//
-//
-
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_INTRINSIC) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 1 */ (isOperandImmEqual(MI0.getOperand(1), [[ID:[0-9]+]], MRI)))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-// CHECK-NEXT: // (intrinsic_wo_chain:i32 [[ID]]:iPTR, GPR32:i32:$src1) => (MOV:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MOV));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
-
-def MOV : I<(outs GPR32:$dst), (ins GPR32:$src1),
- [(set GPR32:$dst, (int_mytarget_nop GPR32:$src1))]>;
-
//===- Test a nested instruction match. -----------------------------------===//
// CHECK-LABEL: if ([&]() {
diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td
index 1e2378550855..b4ce10c64e22 100644
--- a/test/TableGen/intrinsic-varargs.td
+++ b/test/TableGen/intrinsic-varargs.td
@@ -23,7 +23,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> {
}
// isVoid needs to match the definition in ValueTypes.td
-def isVoid : ValueType<0, 108>; // Produces no value
+def isVoid : ValueType<0, 110>; // Produces no value
def llvm_vararg_ty : LLVMType<isVoid>; // this means vararg here
// CHECK: /* 0 */ 0, 29, 0,
diff --git a/test/ThinLTO/X86/Inputs/merge-triple.ll b/test/ThinLTO/X86/Inputs/merge-triple.ll
new file mode 100644
index 000000000000..ea644f5497b9
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/merge-triple.ll
@@ -0,0 +1 @@
+target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/ThinLTO/X86/merge-triple.ll b/test/ThinLTO/X86/merge-triple.ll
new file mode 100644
index 000000000000..8f099d12a23b
--- /dev/null
+++ b/test/ThinLTO/X86/merge-triple.ll
@@ -0,0 +1,10 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/merge-triple.ll -o %t2.bc
+; RUN: llvm-lto -thinlto-action=optimize %t1.bc %t2.bc
+; RUN: llvm-dis < %t1.bc.thinlto.imported.bc | FileCheck %s --check-prefix=CHECK1
+; RUN: llvm-dis < %t2.bc.thinlto.imported.bc | FileCheck %s --check-prefix=CHECK2
+
+target triple = "x86_64-apple-macosx10.12.0"
+
+; CHECK1: target triple = "x86_64-apple-macosx10.12.0"
+; CHECK2: target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll b/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll
deleted file mode 100644
index 4d00d495a07f..000000000000
--- a/test/Transforms/InstCombine/2008-09-29-FoldingOr.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: opt < %s -instcombine -S | grep "or i1"
-; PR2844
-
-define i32 @test(i32 %p_74) {
- %A = icmp eq i32 %p_74, 0 ; <i1> [#uses=1]
- %B = icmp slt i32 %p_74, -638208501 ; <i1> [#uses=1]
- %or.cond = or i1 %A, %B ; <i1> [#uses=1]
- %iftmp.10.0 = select i1 %or.cond, i32 0, i32 1 ; <i32> [#uses=1]
- ret i32 %iftmp.10.0
-}
diff --git a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 0c4842c15988..0c4842c15988 100644
--- a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll
+++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
diff --git a/test/Transforms/InstCombine/NVPTX/lit.local.cfg b/test/Transforms/InstCombine/NVPTX/lit.local.cfg
new file mode 100644
index 000000000000..2cb98eb371b2
--- /dev/null
+++ b/test/Transforms/InstCombine/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/InstCombine/nvvm-intrins.ll b/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
index cb65b8fdc547..cb65b8fdc547 100644
--- a/test/Transforms/InstCombine/nvvm-intrins.ll
+++ b/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index a4375a5cd57e..486a617097e1 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -1470,3 +1470,55 @@ define i32 @test93(i32 %A) {
%D = trunc i96 %C to i32
ret i32 %D
}
+
+; The following four tests check sext + lshr + trunc patterns.
+; PR33078
+
+define i8 @pr33078_1(i8 %A) {
+; CHECK-LABEL: @pr33078_1(
+; CHECK-NEXT: [[C:%.*]] = ashr i8 [[A:%.*]], 7
+; CHECK-NEXT: ret i8 [[C]]
+;
+ %B = sext i8 %A to i16
+ %C = lshr i16 %B, 8
+ %D = trunc i16 %C to i8
+ ret i8 %D
+}
+
+define i12 @pr33078_2(i8 %A) {
+; CHECK-LABEL: @pr33078_2(
+; CHECK-NEXT: [[C:%.*]] = ashr i8 [[A:%.*]], 4
+; CHECK-NEXT: [[D:%.*]] = sext i8 [[C]] to i12
+; CHECK-NEXT: ret i12 [[D]]
+;
+ %B = sext i8 %A to i16
+ %C = lshr i16 %B, 4
+ %D = trunc i16 %C to i12
+ ret i12 %D
+}
+
+define i4 @pr33078_3(i8 %A) {
+; CHECK-LABEL: @pr33078_3(
+; CHECK-NEXT: [[B:%.*]] = sext i8 [[A:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = lshr i16 [[B]], 12
+; CHECK-NEXT: [[D:%.*]] = trunc i16 [[C]] to i4
+; CHECK-NEXT: ret i4 [[D]]
+;
+ %B = sext i8 %A to i16
+ %C = lshr i16 %B, 12
+ %D = trunc i16 %C to i4
+ ret i4 %D
+}
+
+define i8 @pr33078_4(i3 %x) {
+; Don't turn this in an `ashr`. This was getting miscompiled
+; CHECK-LABEL: @pr33078_4(
+; CHECK-NEXT: [[B:%.*]] = sext i3 %x to i16
+; CHECK-NEXT: [[C:%.*]] = lshr i16 [[B]], 13
+; CHECK-NEXT: [[D:%.*]] = trunc i16 [[C]] to i8
+; CHECK-NEXT: ret i8 [[D]]
+ %B = sext i3 %x to i16
+ %C = lshr i16 %B, 13
+ %D = trunc i16 %C to i8
+ ret i8 %D
+}
diff --git a/test/Transforms/InstCombine/lshr.ll b/test/Transforms/InstCombine/lshr.ll
index b81371b03042..0cad7f833ab6 100644
--- a/test/Transforms/InstCombine/lshr.ll
+++ b/test/Transforms/InstCombine/lshr.ll
@@ -100,3 +100,75 @@ define <2 x i8> @lshr_exact_splat_vec(<2 x i8> %x) {
ret <2 x i8> %lshr
}
+; FIXME: The bool bit got smeared across a wide value, but then we zeroed out those bits. This is just a zext.
+
+define i16 @bool_zext(i1 %x) {
+; CHECK-LABEL: @bool_zext(
+; CHECK-NEXT: [[SEXT:%.*]] = sext i1 %x to i16
+; CHECK-NEXT: [[HIBIT:%.*]] = lshr i16 [[SEXT]], 15
+; CHECK-NEXT: ret i16 [[HIBIT]]
+;
+ %sext = sext i1 %x to i16
+ %hibit = lshr i16 %sext, 15
+ ret i16 %hibit
+}
+
+define <2 x i8> @bool_zext_splat(<2 x i1> %x) {
+; CHECK-LABEL: @bool_zext_splat(
+; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i1> %x to <2 x i8>
+; CHECK-NEXT: [[HIBIT:%.*]] = lshr <2 x i8> [[SEXT]], <i8 7, i8 7>
+; CHECK-NEXT: ret <2 x i8> [[HIBIT]]
+;
+ %sext = sext <2 x i1> %x to <2 x i8>
+ %hibit = lshr <2 x i8> %sext, <i8 7, i8 7>
+ ret <2 x i8> %hibit
+}
+
+; FIXME: The replicated sign bits are all that's left. This could be ashr+zext.
+
+define i16 @smear_sign_and_widen(i4 %x) {
+; CHECK-LABEL: @smear_sign_and_widen(
+; CHECK-NEXT: [[SEXT:%.*]] = sext i4 %x to i16
+; CHECK-NEXT: [[HIBIT:%.*]] = lshr i16 [[SEXT]], 12
+; CHECK-NEXT: ret i16 [[HIBIT]]
+;
+ %sext = sext i4 %x to i16
+ %hibit = lshr i16 %sext, 12
+ ret i16 %hibit
+}
+
+define <2 x i8> @smear_sign_and_widen_splat(<2 x i6> %x) {
+; CHECK-LABEL: @smear_sign_and_widen_splat(
+; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i6> %x to <2 x i8>
+; CHECK-NEXT: [[HIBIT:%.*]] = lshr <2 x i8> [[SEXT]], <i8 2, i8 2>
+; CHECK-NEXT: ret <2 x i8> [[HIBIT]]
+;
+ %sext = sext <2 x i6> %x to <2 x i8>
+ %hibit = lshr <2 x i8> %sext, <i8 2, i8 2>
+ ret <2 x i8> %hibit
+}
+
+; FIXME: All of the replicated sign bits are wiped out by the lshr. This could be lshr+zext.
+
+define i16 @fake_sext(i3 %x) {
+; CHECK-LABEL: @fake_sext(
+; CHECK-NEXT: [[SEXT:%.*]] = sext i3 %x to i16
+; CHECK-NEXT: [[SH:%.*]] = lshr i16 [[SEXT]], 15
+; CHECK-NEXT: ret i16 [[SH]]
+;
+ %sext = sext i3 %x to i16
+ %sh = lshr i16 %sext, 15
+ ret i16 %sh
+}
+
+define <2 x i8> @fake_sext_splat(<2 x i3> %x) {
+; CHECK-LABEL: @fake_sext_splat(
+; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i3> %x to <2 x i8>
+; CHECK-NEXT: [[SH:%.*]] = lshr <2 x i8> [[SEXT]], <i8 7, i8 7>
+; CHECK-NEXT: ret <2 x i8> [[SH]]
+;
+ %sext = sext <2 x i3> %x to <2 x i8>
+ %sh = lshr <2 x i8> %sext, <i8 7, i8 7>
+ ret <2 x i8> %sh
+}
+
diff --git a/test/Transforms/InstCombine/memchr.ll b/test/Transforms/InstCombine/memchr.ll
index b0573567bf60..5a081c222fb0 100644
--- a/test/Transforms/InstCombine/memchr.ll
+++ b/test/Transforms/InstCombine/memchr.ll
@@ -190,3 +190,12 @@ define i1 @test15(i32 %C) {
%cmp = icmp ne i8* %dst, null
ret i1 %cmp
}
+
+@s = internal constant [1 x i8] [i8 0], align 1
+define i8* @pr32124() {
+; CHECK-LABEL: @pr32124(
+; CHECK-NEXT: ret i8* getelementptr inbounds ([1 x i8], [1 x i8]* @s, i32 0, i32 0)
+;
+ %res = tail call i8* @memchr(i8* getelementptr ([1 x i8], [1 x i8]* @s, i64 0, i64 0), i32 0, i32 1)
+ ret i8* %res
+}
diff --git a/test/Transforms/InstCombine/set.ll b/test/Transforms/InstCombine/set.ll
index 494a60379011..db2b4c3558e8 100644
--- a/test/Transforms/InstCombine/set.ll
+++ b/test/Transforms/InstCombine/set.ll
@@ -110,8 +110,8 @@ define i1 @test12(i1 %A) {
define i1 @test13(i1 %A, i1 %B) {
; CHECK-LABEL: @test13(
-; CHECK-NEXT: [[CTMP:%.*]] = xor i1 %B, true
-; CHECK-NEXT: [[C:%.*]] = or i1 [[CTMP]], %A
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 %B, true
+; CHECK-NEXT: [[C:%.*]] = or i1 [[TMP1]], %A
; CHECK-NEXT: ret i1 [[C]]
;
%C = icmp uge i1 %A, %B
@@ -120,8 +120,8 @@ define i1 @test13(i1 %A, i1 %B) {
define <2 x i1> @test13vec(<2 x i1> %A, <2 x i1> %B) {
; CHECK-LABEL: @test13vec(
-; CHECK-NEXT: [[CTMP:%.*]] = xor <2 x i1> %B, <i1 true, i1 true>
-; CHECK-NEXT: [[C:%.*]] = or <2 x i1> [[CTMP]], %A
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> %B, <i1 true, i1 true>
+; CHECK-NEXT: [[C:%.*]] = or <2 x i1> [[TMP1]], %A
; CHECK-NEXT: ret <2 x i1> [[C]]
;
%C = icmp uge <2 x i1> %A, %B
@@ -130,8 +130,8 @@ define <2 x i1> @test13vec(<2 x i1> %A, <2 x i1> %B) {
define i1 @test14(i1 %A, i1 %B) {
; CHECK-LABEL: @test14(
-; CHECK-NEXT: [[CTMP:%.*]] = xor i1 %A, %B
-; CHECK-NEXT: [[C:%.*]] = xor i1 [[CTMP]], true
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 %A, %B
+; CHECK-NEXT: [[C:%.*]] = xor i1 [[TMP1]], true
; CHECK-NEXT: ret i1 [[C]]
;
%C = icmp eq i1 %A, %B
@@ -140,14 +140,88 @@ define i1 @test14(i1 %A, i1 %B) {
define <3 x i1> @test14vec(<3 x i1> %A, <3 x i1> %B) {
; CHECK-LABEL: @test14vec(
-; CHECK-NEXT: [[CTMP:%.*]] = xor <3 x i1> %A, %B
-; CHECK-NEXT: [[C:%.*]] = xor <3 x i1> [[CTMP]], <i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP1:%.*]] = xor <3 x i1> %A, %B
+; CHECK-NEXT: [[C:%.*]] = xor <3 x i1> [[TMP1]], <i1 true, i1 true, i1 true>
; CHECK-NEXT: ret <3 x i1> [[C]]
;
%C = icmp eq <3 x i1> %A, %B
ret <3 x i1> %C
}
+define i1 @bool_eq0(i64 %a) {
+; CHECK-LABEL: @bool_eq0(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 %a, 1
+; CHECK-NEXT: ret i1 [[TMP1]]
+;
+ %b = icmp sgt i64 %a, 0
+ %c = icmp eq i64 %a, 1
+ %notc = icmp eq i1 %c, false
+ %and = and i1 %b, %notc
+ ret i1 %and
+}
+
+; FIXME: This is equivalent to the previous test.
+
+define i1 @xor_of_icmps(i64 %a) {
+; CHECK-LABEL: @xor_of_icmps(
+; CHECK-NEXT: [[B:%.*]] = icmp sgt i64 %a, 0
+; CHECK-NEXT: [[C:%.*]] = icmp eq i64 %a, 1
+; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[C]], [[B]]
+; CHECK-NEXT: ret i1 [[XOR]]
+;
+ %b = icmp sgt i64 %a, 0
+ %c = icmp eq i64 %a, 1
+ %xor = xor i1 %c, %b
+ ret i1 %xor
+}
+
+; FIXME: This is also equivalent to the previous test.
+
+define i1 @xor_of_icmps_commute(i64 %a) {
+; CHECK-LABEL: @xor_of_icmps_commute(
+; CHECK-NEXT: [[B:%.*]] = icmp sgt i64 %a, 0
+; CHECK-NEXT: [[C:%.*]] = icmp eq i64 %a, 1
+; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[B]], [[C]]
+; CHECK-NEXT: ret i1 [[XOR]]
+;
+ %b = icmp sgt i64 %a, 0
+ %c = icmp eq i64 %a, 1
+ %xor = xor i1 %b, %c
+ ret i1 %xor
+}
+
+; FIXME: This is (a != 5).
+
+define i1 @xor_of_icmps_folds_more(i64 %a) {
+; CHECK-LABEL: @xor_of_icmps_folds_more(
+; CHECK-NEXT: [[B:%.*]] = icmp sgt i64 %a, 4
+; CHECK-NEXT: [[C:%.*]] = icmp slt i64 %a, 6
+; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[B]], [[C]]
+; CHECK-NEXT: ret i1 [[XOR]]
+;
+ %b = icmp sgt i64 %a, 4
+ %c = icmp slt i64 %a, 6
+ %xor = xor i1 %b, %c
+ ret i1 %xor
+}
+
+; https://bugs.llvm.org/show_bug.cgi?id=2844
+
+define i32 @PR2844(i32 %x) {
+; CHECK-LABEL: @PR2844(
+; CHECK-NEXT: [[A:%.*]] = icmp eq i32 %x, 0
+; CHECK-NEXT: [[B:%.*]] = icmp sgt i32 %x, -638208502
+; CHECK-NEXT: [[NOT_OR:%.*]] = xor i1 [[A]], [[B]]
+; CHECK-NEXT: [[SEL:%.*]] = zext i1 [[NOT_OR]] to i32
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %A = icmp eq i32 %x, 0
+ %B = icmp slt i32 %x, -638208501
+ %or = or i1 %A, %B
+ %sel = select i1 %or, i32 0, i32 1
+ ret i32 %sel
+}
+
define i1 @test16(i32 %A) {
; CHECK-LABEL: @test16(
; CHECK-NEXT: ret i1 false
@@ -191,8 +265,8 @@ endif:
define i1 @test19(i1 %A, i1 %B) {
; CHECK-LABEL: @test19(
-; CHECK-NEXT: [[CTMP:%.*]] = xor i1 %A, %B
-; CHECK-NEXT: [[C:%.*]] = xor i1 [[CTMP]], true
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 %A, %B
+; CHECK-NEXT: [[C:%.*]] = xor i1 [[TMP1]], true
; CHECK-NEXT: ret i1 [[C]]
;
%a = zext i1 %A to i32
diff --git a/test/Transforms/InstCombine/wcslen-1.ll b/test/Transforms/InstCombine/wcslen-1.ll
new file mode 100644
index 000000000000..d4e51750f6da
--- /dev/null
+++ b/test/Transforms/InstCombine/wcslen-1.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test that the wcslen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @wcslen(i32*)
+
+@hello = constant [6 x i32] [i32 104, i32 101, i32 108, i32 108, i32 111, i32 0]
+@longer = constant [7 x i32] [i32 108, i32 111, i32 110, i32 103, i32 101, i32 114, i32 0]
+@null = constant [1 x i32] zeroinitializer
+@null_hello = constant [7 x i32] [i32 0, i32 104, i32 101, i32 108, i32 108, i32 111, i32 0]
+@nullstring = constant i32 0
+@a = common global [32 x i32] zeroinitializer, align 1
+@null_hello_mid = constant [13 x i32] [i32 104, i32 101, i32 108, i32 108, i32 111, i32 32, i32 119, i32 111, i32 114, i32 0, i32 108, i32 100, i32 0]
+
+define i64 @test_simplify1() {
+; CHECK-LABEL: @test_simplify1(
+; CHECK-NEXT: ret i64 5
+;
+ %hello_p = getelementptr [6 x i32], [6 x i32]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ ret i64 %hello_l
+}
+
+define i64 @test_simplify2() {
+; CHECK-LABEL: @test_simplify2(
+; CHECK-NEXT: ret i64 0
+;
+ %null_p = getelementptr [1 x i32], [1 x i32]* @null, i64 0, i64 0
+ %null_l = call i64 @wcslen(i32* %null_p)
+ ret i64 %null_l
+}
+
+define i64 @test_simplify3() {
+; CHECK-LABEL: @test_simplify3(
+; CHECK-NEXT: ret i64 0
+;
+ %null_hello_p = getelementptr [7 x i32], [7 x i32]* @null_hello, i64 0, i64 0
+ %null_hello_l = call i64 @wcslen(i32* %null_hello_p)
+ ret i64 %null_hello_l
+}
+
+define i64 @test_simplify4() {
+; CHECK-LABEL: @test_simplify4(
+; CHECK-NEXT: ret i64 0
+;
+ %len = tail call i64 @wcslen(i32* @nullstring) nounwind
+ ret i64 %len
+}
+
+; Check wcslen(x) == 0 --> *x == 0.
+
+define i1 @test_simplify5() {
+; CHECK-LABEL: @test_simplify5(
+; CHECK-NEXT: ret i1 false
+;
+ %hello_p = getelementptr [6 x i32], [6 x i32]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ %eq_hello = icmp eq i64 %hello_l, 0
+ ret i1 %eq_hello
+}
+
+define i1 @test_simplify6(i32* %str_p) {
+; CHECK-LABEL: @test_simplify6(
+; CHECK-NEXT: [[STRLENFIRST:%.*]] = load i32, i32* [[STR_P:%.*]], align 4
+; CHECK-NEXT: [[EQ_NULL:%.*]] = icmp eq i32 [[STRLENFIRST]], 0
+; CHECK-NEXT: ret i1 [[EQ_NULL]]
+;
+ %str_l = call i64 @wcslen(i32* %str_p)
+ %eq_null = icmp eq i64 %str_l, 0
+ ret i1 %eq_null
+}
+
+; Check wcslen(x) != 0 --> *x != 0.
+
+define i1 @test_simplify7() {
+; CHECK-LABEL: @test_simplify7(
+; CHECK-NEXT: ret i1 true
+;
+ %hello_p = getelementptr [6 x i32], [6 x i32]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ %ne_hello = icmp ne i64 %hello_l, 0
+ ret i1 %ne_hello
+}
+
+define i1 @test_simplify8(i32* %str_p) {
+; CHECK-LABEL: @test_simplify8(
+; CHECK-NEXT: [[STRLENFIRST:%.*]] = load i32, i32* [[STR_P:%.*]], align 4
+; CHECK-NEXT: [[NE_NULL:%.*]] = icmp ne i32 [[STRLENFIRST]], 0
+; CHECK-NEXT: ret i1 [[NE_NULL]]
+;
+ %str_l = call i64 @wcslen(i32* %str_p)
+ %ne_null = icmp ne i64 %str_l, 0
+ ret i1 %ne_null
+}
+
+define i64 @test_simplify9(i1 %x) {
+; CHECK-LABEL: @test_simplify9(
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[X:%.*]], i64 5, i64 6
+; CHECK-NEXT: ret i64 [[TMP1]]
+;
+ %hello = getelementptr [6 x i32], [6 x i32]* @hello, i64 0, i64 0
+ %longer = getelementptr [7 x i32], [7 x i32]* @longer, i64 0, i64 0
+ %s = select i1 %x, i32* %hello, i32* %longer
+ %l = call i64 @wcslen(i32* %s)
+ ret i64 %l
+}
+
+; Check the case that should be simplified to a sub instruction.
+; wcslen(@hello + x) --> 5 - x
+
+define i64 @test_simplify10(i32 %x) {
+; CHECK-LABEL: @test_simplify10(
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i64 5, [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %hello_p = getelementptr inbounds [6 x i32], [6 x i32]* @hello, i32 0, i32 %x
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ ret i64 %hello_l
+}
+
+; wcslen(@null_hello_mid + (x & 7)) --> 9 - (x & 7)
+
+define i64 @test_simplify11(i32 %x) {
+; CHECK-LABEL: @test_simplify11(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[AND]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i64 9, [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %and = and i32 %x, 7
+ %hello_p = getelementptr inbounds [13 x i32], [13 x i32]* @null_hello_mid, i32 0, i32 %and
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ ret i64 %hello_l
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1() {
+; CHECK-LABEL: @test_no_simplify1(
+; CHECK-NEXT: [[A_L:%.*]] = call i64 @wcslen(i32* getelementptr inbounds ([32 x i32], [32 x i32]* @a, i64 0, i64 0))
+; CHECK-NEXT: ret i64 [[A_L]]
+;
+ %a_p = getelementptr [32 x i32], [32 x i32]* @a, i64 0, i64 0
+ %a_l = call i64 @wcslen(i32* %a_p)
+ ret i64 %a_l
+}
+
+; wcslen(@null_hello + x) should not be simplified to a sub instruction.
+
+define i64 @test_no_simplify2(i32 %x) {
+; CHECK-LABEL: @test_no_simplify2(
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT: [[HELLO_P:%.*]] = getelementptr inbounds [7 x i32], [7 x i32]* @null_hello, i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[HELLO_L:%.*]] = call i64 @wcslen(i32* [[HELLO_P]])
+; CHECK-NEXT: ret i64 [[HELLO_L]]
+;
+ %hello_p = getelementptr inbounds [7 x i32], [7 x i32]* @null_hello, i32 0, i32 %x
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ ret i64 %hello_l
+}
+
+; wcslen(@null_hello_mid + (x & 15)) should not be simplified to a sub instruction.
+
+define i64 @test_no_simplify3(i32 %x) {
+; CHECK-LABEL: @test_no_simplify3(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 15
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[AND]] to i64
+; CHECK-NEXT: [[HELLO_P:%.*]] = getelementptr inbounds [13 x i32], [13 x i32]* @null_hello_mid, i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[HELLO_L:%.*]] = call i64 @wcslen(i32* [[HELLO_P]])
+; CHECK-NEXT: ret i64 [[HELLO_L]]
+;
+ %and = and i32 %x, 15
+ %hello_p = getelementptr inbounds [13 x i32], [13 x i32]* @null_hello_mid, i32 0, i32 %and
+ %hello_l = call i64 @wcslen(i32* %hello_p)
+ ret i64 %hello_l
+}
+
+@str16 = constant [1 x i16] [i16 0]
+
+define i64 @test_no_simplify4() {
+; CHECK-LABEL: @test_no_simplify4(
+; CHECK-NEXT: [[L:%.*]] = call i64 @wcslen(i32* bitcast ([1 x i16]* @str16 to i32*))
+; CHECK-NEXT: ret i64 [[L]]
+;
+ %l = call i64 @wcslen(i32* bitcast ([1 x i16]* @str16 to i32*))
+ ret i64 %l
+}
diff --git a/test/Transforms/InstCombine/wcslen-2.ll b/test/Transforms/InstCombine/wcslen-2.ll
new file mode 100644
index 000000000000..c1a70312a2b3
--- /dev/null
+++ b/test/Transforms/InstCombine/wcslen-2.ll
@@ -0,0 +1,18 @@
+; Test that the wcslen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@hello = constant [6 x i32] [i32 104, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+declare i64 @wcslen(i32*, i32)
+
+define i64 @test_no_simplify1() {
+; CHECK-LABEL: @test_no_simplify1(
+ %hello_p = getelementptr [6 x i32], [6 x i32]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i32* %hello_p, i32 187)
+; CHECK-NEXT: %hello_l = call i64 @wcslen
+ ret i64 %hello_l
+; CHECK-NEXT: ret i64 %hello_l
+}
diff --git a/test/Transforms/InstCombine/wcslen-3.ll b/test/Transforms/InstCombine/wcslen-3.ll
new file mode 100644
index 000000000000..c766ff21412d
--- /dev/null
+++ b/test/Transforms/InstCombine/wcslen-3.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test that the wcslen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Test behavior for wchar_size==2
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"wchar_size", i32 2}
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @wcslen(i16*)
+
+@hello = constant [6 x i16] [i16 104, i16 101, i16 108, i16 108, i16 111, i16 0]
+@longer = constant [7 x i16] [i16 108, i16 111, i16 110, i16 103, i16 101, i16 114, i16 0]
+@null = constant [1 x i16] zeroinitializer
+@null_hello = constant [7 x i16] [i16 0, i16 104, i16 101, i16 108, i16 108, i16 111, i16 0]
+@nullstring = constant i16 0
+@a = common global [32 x i16] zeroinitializer, align 1
+@null_hello_mid = constant [13 x i16] [i16 104, i16 101, i16 108, i16 108, i16 111, i16 32, i16 119, i16 111, i16 114, i16 0, i16 108, i16 100, i16 0]
+
+define i64 @test_simplify1() {
+; CHECK-LABEL: @test_simplify1(
+; CHECK-NEXT: ret i64 5
+;
+ %hello_p = getelementptr [6 x i16], [6 x i16]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ ret i64 %hello_l
+}
+
+define i64 @test_simplify2() {
+; CHECK-LABEL: @test_simplify2(
+; CHECK-NEXT: ret i64 0
+;
+ %null_p = getelementptr [1 x i16], [1 x i16]* @null, i64 0, i64 0
+ %null_l = call i64 @wcslen(i16* %null_p)
+ ret i64 %null_l
+}
+
+define i64 @test_simplify3() {
+; CHECK-LABEL: @test_simplify3(
+; CHECK-NEXT: ret i64 0
+;
+ %null_hello_p = getelementptr [7 x i16], [7 x i16]* @null_hello, i64 0, i64 0
+ %null_hello_l = call i64 @wcslen(i16* %null_hello_p)
+ ret i64 %null_hello_l
+}
+
+define i64 @test_simplify4() {
+; CHECK-LABEL: @test_simplify4(
+; CHECK-NEXT: ret i64 0
+;
+ %len = tail call i64 @wcslen(i16* @nullstring) nounwind
+ ret i64 %len
+}
+
+; Check wcslen(x) == 0 --> *x == 0.
+
+define i1 @test_simplify5() {
+; CHECK-LABEL: @test_simplify5(
+; CHECK-NEXT: ret i1 false
+;
+ %hello_p = getelementptr [6 x i16], [6 x i16]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ %eq_hello = icmp eq i64 %hello_l, 0
+ ret i1 %eq_hello
+}
+
+define i1 @test_simplify6(i16* %str_p) {
+; CHECK-LABEL: @test_simplify6(
+; CHECK-NEXT: [[STRLENFIRST:%.*]] = load i16, i16* [[STR_P:%.*]], align 2
+; CHECK-NEXT: [[EQ_NULL:%.*]] = icmp eq i16 [[STRLENFIRST]], 0
+; CHECK-NEXT: ret i1 [[EQ_NULL]]
+;
+ %str_l = call i64 @wcslen(i16* %str_p)
+ %eq_null = icmp eq i64 %str_l, 0
+ ret i1 %eq_null
+}
+
+; Check wcslen(x) != 0 --> *x != 0.
+
+define i1 @test_simplify7() {
+; CHECK-LABEL: @test_simplify7(
+; CHECK-NEXT: ret i1 true
+;
+ %hello_p = getelementptr [6 x i16], [6 x i16]* @hello, i64 0, i64 0
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ %ne_hello = icmp ne i64 %hello_l, 0
+ ret i1 %ne_hello
+}
+
+define i1 @test_simplify8(i16* %str_p) {
+; CHECK-LABEL: @test_simplify8(
+; CHECK-NEXT: [[STRLENFIRST:%.*]] = load i16, i16* [[STR_P:%.*]], align 2
+; CHECK-NEXT: [[NE_NULL:%.*]] = icmp ne i16 [[STRLENFIRST]], 0
+; CHECK-NEXT: ret i1 [[NE_NULL]]
+;
+ %str_l = call i64 @wcslen(i16* %str_p)
+ %ne_null = icmp ne i64 %str_l, 0
+ ret i1 %ne_null
+}
+
+define i64 @test_simplify9(i1 %x) {
+; CHECK-LABEL: @test_simplify9(
+; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[X:%.*]], i64 5, i64 6
+; CHECK-NEXT: ret i64 [[TMP1]]
+;
+ %hello = getelementptr [6 x i16], [6 x i16]* @hello, i64 0, i64 0
+ %longer = getelementptr [7 x i16], [7 x i16]* @longer, i64 0, i64 0
+ %s = select i1 %x, i16* %hello, i16* %longer
+ %l = call i64 @wcslen(i16* %s)
+ ret i64 %l
+}
+
+; Check the case that should be simplified to a sub instruction.
+; wcslen(@hello + x) --> 5 - x
+
+define i64 @test_simplify10(i16 %x) {
+; CHECK-LABEL: @test_simplify10(
+; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i64 5, [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %hello_p = getelementptr inbounds [6 x i16], [6 x i16]* @hello, i16 0, i16 %x
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ ret i64 %hello_l
+}
+
+; wcslen(@null_hello_mid + (x & 7)) --> 9 - (x & 7)
+
+define i64 @test_simplify11(i16 %x) {
+; CHECK-LABEL: @test_simplify11(
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 7
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[AND]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw i64 9, [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %and = and i16 %x, 7
+ %hello_p = getelementptr inbounds [13 x i16], [13 x i16]* @null_hello_mid, i16 0, i16 %and
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ ret i64 %hello_l
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1() {
+; CHECK-LABEL: @test_no_simplify1(
+; CHECK-NEXT: [[A_L:%.*]] = call i64 @wcslen(i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a, i64 0, i64 0))
+; CHECK-NEXT: ret i64 [[A_L]]
+;
+ %a_p = getelementptr [32 x i16], [32 x i16]* @a, i64 0, i64 0
+ %a_l = call i64 @wcslen(i16* %a_p)
+ ret i64 %a_l
+}
+
+; wcslen(@null_hello + x) should not be simplified to a sub instruction.
+
+define i64 @test_no_simplify2(i16 %x) {
+; CHECK-LABEL: @test_no_simplify2(
+; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i64
+; CHECK-NEXT: [[HELLO_P:%.*]] = getelementptr inbounds [7 x i16], [7 x i16]* @null_hello, i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[HELLO_L:%.*]] = call i64 @wcslen(i16* [[HELLO_P]])
+; CHECK-NEXT: ret i64 [[HELLO_L]]
+;
+ %hello_p = getelementptr inbounds [7 x i16], [7 x i16]* @null_hello, i16 0, i16 %x
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ ret i64 %hello_l
+}
+
+; wcslen(@null_hello_mid + (x & 15)) should not be simplified to a sub instruction.
+
+define i64 @test_no_simplify3(i16 %x) {
+; CHECK-LABEL: @test_no_simplify3(
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[X:%.*]], 15
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[AND]] to i64
+; CHECK-NEXT: [[HELLO_P:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* @null_hello_mid, i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[HELLO_L:%.*]] = call i64 @wcslen(i16* [[HELLO_P]])
+; CHECK-NEXT: ret i64 [[HELLO_L]]
+;
+ %and = and i16 %x, 15
+ %hello_p = getelementptr inbounds [13 x i16], [13 x i16]* @null_hello_mid, i16 0, i16 %and
+ %hello_l = call i64 @wcslen(i16* %hello_p)
+ ret i64 %hello_l
+}
+
+@str32 = constant [1 x i32] [i32 0]
+
+; This could in principle be simplified, but the current implementation bails on
+; type mismatches.
+define i64 @test_no_simplify4() {
+; CHECK-LABEL: @test_no_simplify4(
+; CHECK-NEXT: [[L:%.*]] = call i64 @wcslen(i16* bitcast ([1 x i32]* @str32 to i16*))
+; CHECK-NEXT: ret i64 [[L]]
+;
+ %l = call i64 @wcslen(i16* bitcast ([1 x i32]* @str32 to i16*))
+ ret i64 %l
+}
diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index a9b4e4e5cfcc..a027c7e18280 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll
@@ -735,6 +735,74 @@ define i32 @test54(i32 %a, i32 %b) {
ret i32 %or
}
+; (A & B) | ~(A ^ B) -> ~(A ^ B)
+
+define i32 @test55(i32 %a, i32 %b) {
+; CHECK-LABEL: @test55(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], [[XNOR]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %and = and i32 %a, %b
+ %xor = xor i32 %a, %b
+ %xnor = xor i32 %xor, -1
+ %or = or i32 %and, %xnor
+ ret i32 %or
+}
+
+; ~(A ^ B) | (A & B) -> ~(A ^ B)
+
+define i32 @test56(i32 %a, i32 %b) {
+; CHECK-LABEL: @test56(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[XNOR]], [[AND]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %and = and i32 %a, %b
+ %xor = xor i32 %a, %b
+ %xnor = xor i32 %xor, -1
+ %or = or i32 %xnor, %and
+ ret i32 %or
+}
+
+; (B & A) | ~(A ^ B) -> ~(A ^ B)
+
+define i32 @test57(i32 %a, i32 %b) {
+; CHECK-LABEL: @test57(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], [[XNOR]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %and = and i32 %b, %a
+ %xor = xor i32 %a, %b
+ %xnor = xor i32 %xor, -1
+ %or = or i32 %and, %xnor
+ ret i32 %or
+}
+
+; ~(A ^ B) | (A & B) -> ~(A ^ B)
+
+define i32 @test58(i32 %a, i32 %b) {
+; CHECK-LABEL: @test58(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XNOR:%.*]] = xor i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[XNOR]], [[AND]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %and = and i32 %b, %a
+ %xor = xor i32 %a, %b
+ %xnor = xor i32 %xor, -1
+ %or = or i32 %xnor, %and
+ ret i32 %or
+}
+
define i8 @lshr_perfect_mask(i8 %x) {
; CHECK-LABEL: @lshr_perfect_mask(
; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5
@@ -797,3 +865,11 @@ define <2 x i8> @shl_undersized_mask_splat(<2 x i8> %x) {
ret <2 x i8> %mask
}
+define i32 @reversed_not(i32 %a) {
+; CHECK-LABEL: @reversed_not(
+; CHECK-NEXT: ret i32 -1
+;
+ %nega = xor i32 -1, %a
+ %or = or i32 %a, %nega
+ ret i32 %or
+}
diff --git a/test/Transforms/InstSimplify/icmp-bool-constant.ll b/test/Transforms/InstSimplify/icmp-bool-constant.ll
new file mode 100644
index 000000000000..f711fae0a857
--- /dev/null
+++ b/test/Transforms/InstSimplify/icmp-bool-constant.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; Test all integer predicates with bool types and true/false constants.
+; Use vectors to provide test coverage that is not duplicated in other folds.
+
+define <2 x i1> @eq_t(<2 x i1> %a) {
+; CHECK-LABEL: @eq_t(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp eq <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @eq_f(<2 x i1> %a) {
+; CHECK-LABEL: @eq_f(
+; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i1> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp eq <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ne_t(<2 x i1> %a) {
+; CHECK-LABEL: @ne_t(
+; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i1> %a, <i1 true, i1 true>
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp ne <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ne_f(<2 x i1> %a) {
+; CHECK-LABEL: @ne_f(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp ne <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ugt_t(<2 x i1> %a) {
+; CHECK-LABEL: @ugt_t(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %r = icmp ugt <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ugt_f(<2 x i1> %a) {
+; CHECK-LABEL: @ugt_f(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp ugt <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ult_t(<2 x i1> %a) {
+; CHECK-LABEL: @ult_t(
+; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i1> %a, <i1 true, i1 true>
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp ult <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ult_f(<2 x i1> %a) {
+; CHECK-LABEL: @ult_f(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %r = icmp ult <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sgt_t(<2 x i1> %a) {
+; CHECK-LABEL: @sgt_t(
+; CHECK-NEXT: [[R:%.*]] = icmp sgt <2 x i1> %a, <i1 true, i1 true>
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp sgt <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sgt_f(<2 x i1> %a) {
+; CHECK-LABEL: @sgt_f(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %r = icmp sgt <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @slt_t(<2 x i1> %a) {
+; CHECK-LABEL: @slt_t(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %r = icmp slt <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @slt_f(<2 x i1> %a) {
+; CHECK-LABEL: @slt_f(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp slt <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @uge_t(<2 x i1> %a) {
+; CHECK-LABEL: @uge_t(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp uge <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @uge_f(<2 x i1> %a) {
+; CHECK-LABEL: @uge_f(
+; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
+;
+ %r = icmp uge <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ule_t(<2 x i1> %a) {
+; CHECK-LABEL: @ule_t(
+; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
+;
+ %r = icmp ule <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @ule_f(<2 x i1> %a) {
+; CHECK-LABEL: @ule_f(
+; CHECK-NEXT: [[R:%.*]] = icmp ule <2 x i1> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp ule <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sge_t(<2 x i1> %a) {
+; CHECK-LABEL: @sge_t(
+; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
+;
+ %r = icmp sge <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sge_f(<2 x i1> %a) {
+; CHECK-LABEL: @sge_f(
+; CHECK-NEXT: [[R:%.*]] = icmp sge <2 x i1> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %r = icmp sge <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sle_t(<2 x i1> %a) {
+; CHECK-LABEL: @sle_t(
+; CHECK-NEXT: ret <2 x i1> %a
+;
+ %r = icmp sle <2 x i1> %a, <i1 true, i1 true>
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @sle_f(<2 x i1> %a) {
+; CHECK-LABEL: @sle_f(
+; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
+;
+ %r = icmp sle <2 x i1> %a, <i1 false, i1 false>
+ ret <2 x i1> %r
+}
+
diff --git a/test/Transforms/JumpThreading/assume.ll b/test/Transforms/JumpThreading/assume.ll
index 53010b71c728..3a039676e172 100644
--- a/test/Transforms/JumpThreading/assume.ll
+++ b/test/Transforms/JumpThreading/assume.ll
@@ -56,6 +56,50 @@ return: ; preds = %entry, %if.then
ret i32 %retval.0
}
+@g = external global i32
+
+; Check that we do prove a fact using an assume within the block.
+; FIXME: We can fold the assume based on the semantics of assume.
+; CHECK-LABEL: @can_fold_assume
+; CHECK: %notnull = icmp ne i32* %array, null
+; CHECK-NEXT: call void @llvm.assume(i1 %notnull)
+; CHECK-NEXT: ret void
+define void @can_fold_assume(i32* %array) {
+ %notnull = icmp ne i32* %array, null
+ call void @llvm.assume(i1 %notnull)
+ br i1 %notnull, label %normal, label %error
+
+normal:
+ ret void
+
+error:
+ store atomic i32 0, i32* @g unordered, align 4
+ ret void
+}
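For reference, a minimal sketch (not part of this patch) of the assume-based fold this test relies on: executing @llvm.assume(i1 %c) with %c false is undefined behaviour, so from the assume onward %c may be treated as true and a later branch on it can be folded to its true successor.

  %c = icmp ne i32* %p, null
  call void @llvm.assume(i1 %c)
  br i1 %c, label %ok, label %err   ; may be simplified to: br label %ok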
+
+declare void @f(i1)
+declare void @exit()
+; We can fold the branch using the assume, but we must not fold the uses of
+; %notnull that occur before the assume.
+define void @dont_fold_incorrectly(i32* %array) {
+; CHECK-LABEL:@dont_fold_incorrectly
+; CHECK: @f(i1 %notnull)
+; CHECK-NEXT: exit()
+; CHECK-NEXT: assume(i1 %notnull)
+; CHECK-NEXT: ret void
+ %notnull = icmp ne i32* %array, null
+ call void @f(i1 %notnull)
+ call void @exit()
+ call void @llvm.assume(i1 %notnull)
+ br i1 %notnull, label %normal, label %error
+
+normal:
+ ret void
+
+error:
+ store atomic i32 0, i32* @g unordered, align 4
+ ret void
+}
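Conversely, an illustrative sketch (not part of this patch) of the fold that must not happen in @dont_fold_incorrectly: the interposed calls to @f and @exit are not guaranteed to return, so the assume is not guaranteed to be reached from those earlier uses and %notnull cannot be replaced with true there.

  call void @f(i1 true)               ; WRONG: %notnull is not yet known true here
  call void @exit()
  call void @llvm.assume(i1 %notnull)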
+
; Function Attrs: nounwind
declare void @llvm.assume(i1) #1
diff --git a/test/Transforms/JumpThreading/fold-not-thread.ll b/test/Transforms/JumpThreading/fold-not-thread.ll
index 06ddc10e02b6..f05169b31bc8 100644
--- a/test/Transforms/JumpThreading/fold-not-thread.ll
+++ b/test/Transforms/JumpThreading/fold-not-thread.ll
@@ -133,10 +133,10 @@ L3:
ret void
}
-; Make sure we can do the RAUW for %add...
+; FIXME: Make sure we can do the RAUW for %add...
;
; CHECK-LABEL: @rauw_if_possible(
-; CHECK: call void @f4(i32 96)
+; CHECK: call void @f4(i32 %add)
define void @rauw_if_possible(i32 %value) nounwind {
entry:
%cmp = icmp eq i32 %value, 32
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
index eac2b5dcd85f..c5f72b113efc 100644
--- a/test/Transforms/JumpThreading/guards.ll
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -181,3 +181,97 @@ Exit:
; CHECK-NEXT: ret void
ret void
}
+
+declare void @never_called()
+
+; Assume the guard is always taken and we deoptimize, so we never reach the
+; branch below that guard. We should *never* change the behaviour of a guard from
+; `must deoptimize` to `may deoptimize`, since this affects the program
+; semantics.
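As a rough sketch of the intrinsic's documented behaviour (illustrative, not part of this test), a guard acts like a branch whose false side deoptimizes, which is why weakening it from `must deoptimize` to `may deoptimize` changes program semantics:

  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
  ; behaves approximately like:
  ;   br i1 %cond, label %continue, label %deopt
  ; deopt:
  ;   call void (...) @llvm.experimental.deoptimize() [ "deopt"() ]
  ;   ret void
  ; continue:
  ;   ...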
+define void @dont_fold_guard(i8* %addr, i32 %i, i32 %length) {
+; CHECK-LABEL: dont_fold_guard
+; CHECK: experimental.guard(i1 %wide.chk)
+
+entry:
+ br label %BBPred
+
+BBPred:
+ %cond = icmp eq i8* %addr, null
+ br i1 %cond, label %zero, label %not_zero
+
+zero:
+ unreachable
+
+not_zero:
+ %c1 = icmp ult i32 %i, %length
+ %c2 = icmp eq i32 %i, 0
+ %wide.chk = and i1 %c1, %c2
+ call void(i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+ br i1 %c2, label %unreachedBB2, label %unreachedBB1
+
+unreachedBB2:
+ call void @never_called()
+ ret void
+
+unreachedBB1:
+ ret void
+}
+
+
+; Same as @dont_fold_guard, but the condition %cmp is not an instruction.
+; We cannot fold the guard under any circumstance.
+; FIXME: We can merge unreachedBB2 into not_zero.
+define void @dont_fold_guard2(i8* %addr, i1 %cmp, i32 %i, i32 %length) {
+; CHECK-LABEL: dont_fold_guard2
+; CHECK: guard(i1 %cmp)
+
+entry:
+ br label %BBPred
+
+BBPred:
+ %cond = icmp eq i8* %addr, null
+ br i1 %cond, label %zero, label %not_zero
+
+zero:
+ unreachable
+
+not_zero:
+ call void(i1, ...) @llvm.experimental.guard(i1 %cmp) [ "deopt"() ]
+ br i1 %cmp, label %unreachedBB2, label %unreachedBB1
+
+unreachedBB2:
+ call void @never_called()
+ ret void
+
+unreachedBB1:
+ ret void
+}
+
+; Same as @dont_fold_guard, but using a switch instead of a branch.
+; This exercises the `ProcessThreadableEdges` code path in the source.
+declare void @f(i1)
+define void @dont_fold_guard3(i1 %cmp1, i32 %i) nounwind {
+; CHECK-LABEL: dont_fold_guard3
+; CHECK-LABEL: L2:
+; CHECK-NEXT: %cmp = icmp eq i32 %i, 0
+; CHECK-NEXT: guard(i1 %cmp)
+; CHECK-NEXT: @f(i1 %cmp)
+; CHECK-NEXT: ret void
+entry:
+ br i1 %cmp1, label %L0, label %L3
+L0:
+ %cmp = icmp eq i32 %i, 0
+ call void(i1, ...) @llvm.experimental.guard(i1 %cmp) [ "deopt"() ]
+ switch i1 %cmp, label %L3 [
+ i1 false, label %L1
+ i1 true, label %L2
+ ]
+
+L1:
+ ret void
+L2:
+ call void @f(i1 %cmp)
+ ret void
+L3:
+ ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/canonical-2.ll b/test/Transforms/LoopStrengthReduce/X86/canonical-2.ll
new file mode 100644
index 000000000000..69bae3a51159
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/canonical-2.ll
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s
+; PR33077. Check that the LSR Use formula to be inserted is already canonicalized and
+; will not trigger an assertion.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: uwtable
+define void @foo() {
+cHeapLvb.exit:
+ br label %not_zero48.us
+
+not_zero48.us: ; preds = %not_zero48.us, %cHeapLvb.exit
+ %indvars.iv.us = phi i64 [ %indvars.iv.next.us.7, %not_zero48.us ], [ undef, %cHeapLvb.exit ]
+ %0 = phi i32 [ %13, %not_zero48.us ], [ undef, %cHeapLvb.exit ]
+ %indvars.iv.next.us = add nuw nsw i64 %indvars.iv.us, 1
+ %1 = add i32 %0, 2
+ %2 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %indvars.iv.next.us
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %4 = add i32 %0, 3
+ %5 = load i32, i32 addrspace(1)* undef, align 4
+ %6 = sub i32 undef, %5
+ %factor.us.2 = shl i32 %6, 1
+ %7 = add i32 %factor.us.2, %1
+ %8 = load i32, i32 addrspace(1)* undef, align 4
+ %9 = sub i32 %7, %8
+ %factor.us.3 = shl i32 %9, 1
+ %10 = add i32 %factor.us.3, %4
+ %11 = load i32, i32 addrspace(1)* undef, align 4
+ %12 = sub i32 %10, %11
+ %factor.us.4 = shl i32 %12, 1
+ %13 = add i32 %0, 8
+ %indvars.iv.next.us.7 = add nsw i64 %indvars.iv.us, 8
+ br label %not_zero48.us
+}
+
diff --git a/test/Transforms/NewGVN/completeness.ll b/test/Transforms/NewGVN/completeness.ll
new file mode 100644
index 000000000000..bafe5f966d22
--- /dev/null
+++ b/test/Transforms/NewGVN/completeness.ll
@@ -0,0 +1,415 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @test1(i32, i8**) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK: br label [[TMP6:%.*]]
+; CHECK: br label [[TMP6]]
+; CHECK: [[TMP7:%.*]] = phi i32 [ 75, [[TMP4]] ], [ 105, [[TMP5]] ]
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 5, [[TMP4]] ], [ 7, [[TMP5]] ]
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+ %3 = icmp ne i32 %0, 0
+ br i1 %3, label %4, label %5
+
+; <label>:4: ; preds = %2
+ br label %6
+
+; <label>:5: ; preds = %2
+ br label %6
+
+; <label>:6: ; preds = %5, %4
+ %.0 = phi i32 [ 5, %4 ], [ 7, %5 ]
+ %7 = mul nsw i32 %.0, 15
+ ret i32 %7
+}
+
+define i32 @test2(i32) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; CHECK: br label [[TMP5:%.*]]
+; CHECK: br label [[TMP5]]
+; CHECK: [[DOT01:%.*]] = phi i32 [ 3, [[TMP3]] ], [ 2, [[TMP4]] ]
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 2, [[TMP3]] ], [ 3, [[TMP4]] ]
+; CHECK-NEXT: ret i32 5
+;
+ %2 = icmp ne i32 %0, 0
+ br i1 %2, label %3, label %4
+
+; <label>:3: ; preds = %1
+ br label %5
+
+; <label>:4: ; preds = %1
+ br label %5
+
+; <label>:5: ; preds = %4, %3
+ %.01 = phi i32 [ 3, %3 ], [ 2, %4 ]
+ %.0 = phi i32 [ 2, %3 ], [ 3, %4 ]
+ %6 = add nsw i32 %.01, %.0
+ ret i32 %6
+}
+define i32 @test3(i1 %which) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK: delay:
+; CHECK-NEXT: br label [[FINAL]]
+; CHECK: final:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ -877, [[ENTRY:%.*]] ], [ 113, [[DELAY]] ]
+; CHECK-NEXT: [[A:%.*]] = phi i32 [ 1000, [[ENTRY]] ], [ 10, [[DELAY]] ]
+; CHECK-NEXT: ret i32 [[TMP0]]
+;
+
+entry:
+ br i1 %which, label %final, label %delay
+
+delay:
+ br label %final
+
+final:
+ %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+ %value = sub i32 123, %A
+ ret i32 %value
+}
+
+define <2 x i32> @test3vec(i1 %which) {
+; CHECK-LABEL: @test3vec(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK: delay:
+; CHECK-NEXT: br label [[FINAL]]
+; CHECK: final:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 -877, i32 -877>, [[ENTRY:%.*]] ], [ <i32 113, i32 113>, [[DELAY]] ]
+; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1000, i32 1000>, [[ENTRY]] ], [ <i32 10, i32 10>, [[DELAY]] ]
+; CHECK-NEXT: ret <2 x i32> [[TMP0]]
+;
+
+entry:
+ br i1 %which, label %final, label %delay
+
+delay:
+ br label %final
+
+final:
+ %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+ %value = sub <2 x i32> <i32 123, i32 123>, %A
+ ret <2 x i32> %value
+}
+
+define <2 x i32> @test3vec2(i1 %which) {
+; CHECK-LABEL: @test3vec2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK: delay:
+; CHECK-NEXT: br label [[FINAL]]
+; CHECK: final:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 -877, i32 -2167>, [[ENTRY:%.*]] ], [ <i32 113, i32 303>, [[DELAY]] ]
+; CHECK-NEXT: [[A:%.*]] = phi <2 x i32> [ <i32 1000, i32 2500>, [[ENTRY]] ], [ <i32 10, i32 30>, [[DELAY]] ]
+; CHECK-NEXT: ret <2 x i32> [[TMP0]]
+;
+
+entry:
+ br i1 %which, label %final, label %delay
+
+delay:
+ br label %final
+
+final:
+ %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+ %value = sub <2 x i32> <i32 123, i32 333>, %A
+ ret <2 x i32> %value
+}
+
+;; This example is a bit contrived because we can't create fake MemoryUses, so we use two loads in the if blocks.
+define i32 @test4(i32, i8**, i32* noalias, i32* noalias) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: store i32 5, i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT: store i32 7, i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK: br label [[TMP8:%.*]]
+; CHECK: br label [[TMP8]]
+; CHECK: [[DOT01:%.*]] = phi i32 [ 5, [[TMP6]] ], [ 7, [[TMP7]] ]
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32* [ [[TMP2]], [[TMP6]] ], [ [[TMP3]], [[TMP7]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOT0]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP9]], 15
+; CHECK-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP10]], [[DOT01]]
+; CHECK-NEXT: ret i32 [[TMP11]]
+;
+ store i32 5, i32* %2, align 4
+ store i32 7, i32* %3, align 4
+ %5 = icmp ne i32 %0, 0
+ br i1 %5, label %6, label %8
+
+; <label>:6: ; preds = %4
+ %7 = load i32, i32* %2, align 4
+ br label %10
+
+; <label>:8: ; preds = %4
+ %9 = load i32, i32* %3, align 4
+ br label %10
+
+; <label>:10: ; preds = %8, %6
+ %.01 = phi i32 [ %7, %6 ], [ %9, %8 ]
+ %.0 = phi i32* [ %2, %6 ], [ %3, %8 ]
+ %11 = load i32, i32* %.0, align 4
+ %12 = mul nsw i32 %11, 15
+ %13 = mul nsw i32 %12, %.01
+ ret i32 %13
+}
+
+@global = common global [100 x i64] zeroinitializer, align 16
+@global.1 = common global [100 x i64] zeroinitializer, align 16
+define i64 @test5(i64 %arg) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[ARG:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[BB28:%.*]], label [[BB2:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[BB7:%.*]]
+; CHECK: bb4:
+; CHECK-NEXT: br label [[BB5:%.*]]
+; CHECK: bb5:
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP9:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[BB27:%.*]], label [[BB7]]
+; CHECK: bb7:
+; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ [[ARG]], [[BB2]] ], [ [[TMP9]], [[BB5]] ]
+; CHECK-NEXT: [[TMP9]] = add nsw i64 [[TMP8]], -1
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @global, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @global.1, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP12:%.*]] = mul nsw i64 [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[TMP13]], label [[BB5]], label [[BB14:%.*]]
+; CHECK: bb14:
+; CHECK-NEXT: br label [[BB15:%.*]]
+; CHECK: bb15:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP25:%.*]], [[BB15]] ], [ [[TMP12]], [[BB14]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = phi i64 [ [[TMP24:%.*]], [[BB15]] ], [ [[TMP11]], [[BB14]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = phi i64 [ [[TMP22:%.*]], [[BB15]] ], [ [[TMP10]], [[BB14]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = phi i64 [ [[TMP20:%.*]], [[BB15]] ], [ 0, [[BB14]] ]
+; CHECK-NEXT: store i64 [[TMP0]], i64* [[TMP]], align 8
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP18]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [100 x i64], [100 x i64]* @global, i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22]] = load i64, i64* [[TMP21]], align 8
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [100 x i64], [100 x i64]* @global.1, i64 0, i64 [[TMP20]]
+; CHECK-NEXT: [[TMP24]] = load i64, i64* [[TMP23]], align 8
+; CHECK-NEXT: [[TMP25]] = mul nsw i64 [[TMP24]], [[TMP22]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP20]], [[TMP25]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[BB4:%.*]], label [[BB15]]
+; CHECK: bb27:
+; CHECK-NEXT: br label [[BB28]]
+; CHECK: bb28:
+; CHECK-NEXT: ret i64 0
+;
+bb:
+ %tmp = alloca i64, align 8
+ %tmp1 = icmp eq i64 %arg, 0
+ br i1 %tmp1, label %bb28, label %bb2
+
+bb2: ; preds = %bb
+ %tmp3 = bitcast i64* %tmp to i8*
+ br label %bb7
+
+bb4: ; preds = %bb15
+ br label %bb5
+
+bb5: ; preds = %bb7, %bb4
+ %tmp6 = icmp eq i64 %tmp9, 0
+ br i1 %tmp6, label %bb27, label %bb7
+
+bb7: ; preds = %bb5, %bb2
+ %tmp8 = phi i64 [ %arg, %bb2 ], [ %tmp9, %bb5 ]
+ %tmp9 = add nsw i64 %tmp8, -1
+ %tmp10 = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @global, i64 0, i64 0), align 16
+ %tmp11 = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @global.1, i64 0, i64 0), align 16
+ %tmp12 = mul nsw i64 %tmp11, %tmp10
+ %tmp13 = icmp eq i64 %tmp12, 0
+ br i1 %tmp13, label %bb5, label %bb14
+
+bb14: ; preds = %bb7
+ br label %bb15
+
+bb15: ; preds = %bb15, %bb14
+ %tmp16 = phi i64 [ %tmp24, %bb15 ], [ %tmp11, %bb14 ]
+ %tmp17 = phi i64 [ %tmp22, %bb15 ], [ %tmp10, %bb14 ]
+ %tmp18 = phi i64 [ %tmp20, %bb15 ], [ 0, %bb14 ]
+;; This multiply is an op of phis, which is really equivalent to phi(tmp25, tmp12); see the sketch after this function.
+ %tmp19 = mul nsw i64 %tmp16, %tmp17
+ store i64 %tmp19, i64* %tmp, align 8
+ %tmp20 = add nuw nsw i64 %tmp18, 1
+ %tmp21 = getelementptr inbounds [100 x i64], [100 x i64]* @global, i64 0, i64 %tmp20
+ %tmp22 = load i64, i64* %tmp21, align 8
+ %tmp23 = getelementptr inbounds [100 x i64], [100 x i64]* @global.1, i64 0, i64 %tmp20
+ %tmp24 = load i64, i64* %tmp23, align 8
+ %tmp25 = mul nsw i64 %tmp24, %tmp22
+ %tmp26 = icmp eq i64 %tmp20, %tmp25
+ br i1 %tmp26, label %bb4, label %bb15
+
+bb27: ; preds = %bb5
+ br label %bb28
+
+bb28: ; preds = %bb27, %bb
+ ret i64 0
+}
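A minimal sketch of the "phi of ops" idea referenced in the comment inside @test5 (hypothetical names, not taken from this test): an operation whose operands are phis over the same predecessors can itself be expressed as a phi of the per-predecessor results.

  ; %p = phi i64 [ %a, %left ], [ %b, %right ]
  ; %q = phi i64 [ %c, %left ], [ %d, %right ]
  ; %m = mul i64 %p, %q
  ; ...is equivalent to a phi of the translated operations:
  ; %m = phi i64 [ %ac, %left ], [ %bd, %right ]   ; %ac = mul %a, %c in %left, %bd = mul %b, %d in %right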
+
+;; These icmps are all equivalent to phis of constants
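A minimal sketch of the equivalence the comment above describes, matching the CHECK lines below: comparing a phi of constants against a constant is itself a phi of constant i1 values.

  ; %phi = phi i8 [ 0, %entry-block ], [ 1, %core ]
  ; %switch_0 = icmp eq i8 %phi, 0
  ; ...is equivalent to:
  ; %switch_0 = phi i1 [ true, %entry-block ], [ false, %core ]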
+define i8 @test6(i8* %addr) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: entry-block:
+; CHECK-NEXT: br label %main-loop
+; CHECK: main-loop:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ true, %entry-block ], [ false, [[CORE:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %entry-block ], [ true, [[CORE]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ 0, %entry-block ], [ 1, [[CORE]] ]
+; CHECK-NEXT: store volatile i8 0, i8* [[ADDR:%.*]]
+; CHECK-NEXT: br i1 [[TMP0]], label %busy-wait-phi-0, label [[EXIT:%.*]]
+; CHECK: busy-wait-phi-0:
+; CHECK-NEXT: [[LOAD:%.*]] = load volatile i8, i8* [[ADDR]]
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
+; CHECK-NEXT: br i1 [[ICMP]], label %busy-wait-phi-0, label [[CORE]]
+; CHECK: core:
+; CHECK-NEXT: br i1 [[TMP1]], label [[TRAP:%.*]], label %main-loop
+; CHECK: trap:
+; CHECK-NEXT: ret i8 1
+; CHECK: exit:
+; CHECK-NEXT: ret i8 0
+;
+entry-block:
+ br label %main-loop
+
+main-loop:
+ %phi = phi i8 [ 0, %entry-block ], [ 1, %core ]
+ %switch_0 = icmp eq i8 %phi, 0
+ store volatile i8 0, i8* %addr
+ br i1 %switch_0, label %busy-wait-phi-0, label %exit
+
+busy-wait-phi-0:
+ %load = load volatile i8, i8* %addr
+ %icmp = icmp eq i8 %load, 0
+ br i1 %icmp, label %busy-wait-phi-0, label %core
+
+core:
+ %switch_1 = icmp eq i8 %phi, 1
+ br i1 %switch_1, label %trap, label %main-loop
+
+trap:
+ ret i8 1
+
+exit:
+ ret i8 0
+}
+
+; Test that we don't loop forever while simplifying
+; an undefined value that can go both ways.
+define void @test7() {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[BB1]]
+;
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i32 [ undef, %bb ], [ %tmp3, %bb1 ]
+ %tmp2 = icmp eq i32 %tmp, 0
+ %tmp3 = select i1 %tmp2, i32 1, i32 %tmp
+ br label %bb1
+}
+
+
+
+; Test that we get a consistent answer about what the
+; value of this undefined select is.
+define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br label [[BB1]]
+;
+bb:
+ %tmp = select i1 undef, i8 0, i8 1
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp2 = phi i8 [ %tmp4, %bb1 ], [ %tmp, %bb ]
+ %tmp3 = icmp eq i8 %tmp2, 0
+ %tmp4 = select i1 %tmp3, i8 1, i8 %tmp2
+ br label %bb1
+}
+
+
+;; Make sure we handle the case where we later come up with an expression that we need
+;; for a phi of ops.
+define void @test9() {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: br label [[BB6:%.*]]
+; CHECK: bb6:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ -13, [[BB2]] ], [ [[TMP11:%.*]], [[BB6]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ 1, [[BB2]] ], [ [[TMP8:%.*]], [[BB6]] ]
+; CHECK-NEXT: [[TMP8]] = add nuw nsw i32 [[TMP7]], 1
+; CHECK-NEXT: [[TMP11]] = add i32 -14, [[TMP8]]
+; CHECK-NEXT: br label [[BB6]]
+;
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ br i1 undef, label %bb1, label %bb2
+
+bb2: ; preds = %bb1
+ %tmp = select i1 true, i32 -14, i32 -10
+ %tmp3 = add i32 %tmp, 0
+ %tmp4 = select i1 true, i32 -14, i32 -10
+ %tmp5 = add i32 %tmp4, 0
+ br label %bb6
+
+bb6: ; preds = %bb6, %bb2
+ %tmp7 = phi i32 [ 1, %bb2 ], [ %tmp13, %bb6 ]
+ %tmp8 = add nuw nsw i32 %tmp7, 1
+ %tmp9 = add i32 %tmp3, %tmp7
+ %tmp10 = select i1 false, i32 undef, i32 %tmp9
+ %tmp11 = add i32 %tmp5, %tmp8
+ %tmp12 = select i1 undef, i32 undef, i32 %tmp11
+ %tmp13 = add nuw nsw i32 %tmp7, 1
+ br label %bb6
+}
+
+;; Ensure that we revisit predicateinfo operands at the right points in time.
+define void @test10() {
+b:
+ %m = getelementptr i32, i32* null, i64 8
+ br label %g
+
+g: ; preds = %i, %b
+ %n = phi i32* [ %h, %i ], [ null, %b ]
+ %h = getelementptr i32, i32* %n, i64 1
+ %j = icmp eq i32* %h, %m
+ br i1 %j, label %c, label %i
+
+i: ; preds = %g
+ br i1 undef, label %k, label %g
+
+k: ; preds = %i
+ %l = icmp eq i32* %n, %m
+ br i1 %l, label %c, label %o
+
+o: ; preds = %k
+ br label %c
+
+c: ; preds = %o, %k, %g
+ %0 = phi i32* [ undef, %o ], [ %m, %k ], [ %m, %g ]
+ ret void
+}
diff --git a/test/Transforms/NewGVN/pr32838.ll b/test/Transforms/NewGVN/pr32838.ll
new file mode 100644
index 000000000000..b6b7b0d19b86
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32838.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;RUN: opt -newgvn -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+;; Ensure we don't loop forever when all phi arguments are really unreachable or self-defined.
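To illustrate what "self-defined" means here (a sketch of the concept, not this test's exact CFG): a phi whose every incoming value is either the phi itself or arrives along an edge that is never taken has no real value for the pass to simplify it to.

  ; %p = phi i64 [ %p, %backedge ], [ undef, %dead.edge ]   ; self-defined; %dead.edge comes from a `br i1 false`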
+define void @fn1(i64 %arg) {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[COND_TRUE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
+; CHECK: firstphiblock:
+; CHECK-NEXT: br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK:%.*]]
+; CHECK: secondphiblock:
+; CHECK-NEXT: [[SECONDPHI:%.*]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ undef, [[FIRSTPHIBLOCK]] ]
+; CHECK-NEXT: br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
+; CHECK: thirdphiblock:
+; CHECK-NEXT: [[THIRDPHI]] = phi i64 [ [[SECONDPHI]], [[SECONDPHIBLOCK]] ], [ [[DIV:%.*]], [[COND_TRUE]] ]
+; CHECK-NEXT: br label [[SECONDPHIBLOCK]]
+; CHECK: for.cond17thread-pre-split:
+; CHECK-NEXT: br label [[COND_TRUE]]
+; CHECK: cond.true:
+; CHECK-NEXT: [[DIV]] = sdiv i64 [[ARG:%.*]], 4
+; CHECK-NEXT: br label [[THIRDPHIBLOCK]]
+; CHECK: temp:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %if.then, label %cond.true
+if.then:
+ br i1 false, label %firstphiblock, label %temp
+firstphiblock:
+ %firstphi = phi i64 [ %arg, %if.then ], [ undef, %secondphiblock ]
+ br i1 undef, label %for.cond17thread-pre-split, label %secondphiblock
+secondphiblock:
+ %secondphi = phi i64 [ %thirdphi, %thirdphiblock ], [ %firstphi, %firstphiblock ]
+ br i1 undef, label %firstphiblock, label %thirdphiblock
+thirdphiblock:
+ %thirdphi = phi i64 [ %secondphi, %secondphiblock ], [ %div, %cond.true ]
+ br label %secondphiblock
+for.cond17thread-pre-split:
+ br label %cond.true
+cond.true:
+ %fourthphi = phi i64 [ %arg, %entry ], [ %firstphi, %for.cond17thread-pre-split ]
+ %div = sdiv i64 %fourthphi, 4
+ br label %thirdphiblock
+temp:
+ ret void
+}
+define void @fn2(i64 %arg) {
+; CHECK-LABEL: @fn2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[COND_TRUE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
+; CHECK: firstphiblock:
+; CHECK-NEXT: [[FIRSTPHI:%.*]] = phi i64 [ undef, [[IF_THEN]] ], [ [[SECONDPHI:%.*]], [[SECONDPHIBLOCK:%.*]] ]
+; CHECK-NEXT: br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK]]
+; CHECK: secondphiblock:
+; CHECK-NEXT: [[SECONDPHI]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ [[FIRSTPHI]], [[FIRSTPHIBLOCK]] ]
+; CHECK-NEXT: br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
+; CHECK: thirdphiblock:
+; CHECK-NEXT: [[THIRDPHI]] = phi i64 [ [[SECONDPHI]], [[SECONDPHIBLOCK]] ], [ [[DIV:%.*]], [[COND_TRUE]] ]
+; CHECK-NEXT: br label [[SECONDPHIBLOCK]]
+; CHECK: for.cond17thread-pre-split:
+; CHECK-NEXT: br label [[COND_TRUE]]
+; CHECK: cond.true:
+; CHECK-NEXT: [[FOURTHPHI:%.*]] = phi i64 [ [[ARG:%.*]], [[ENTRY:%.*]] ], [ [[FIRSTPHI]], %for.cond17thread-pre-split ]
+; CHECK-NEXT: [[DIV]] = sdiv i64 [[FOURTHPHI]], 4
+; CHECK-NEXT: br label [[THIRDPHIBLOCK]]
+; CHECK: temp:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %if.then, label %cond.true
+if.then:
+ br i1 false, label %firstphiblock, label %temp
+firstphiblock:
+ %firstphi = phi i64 [ %arg, %if.then ], [ %secondphi, %secondphiblock ]
+ br i1 undef, label %for.cond17thread-pre-split, label %secondphiblock
+secondphiblock:
+ %secondphi = phi i64 [ %thirdphi, %thirdphiblock ], [ %firstphi, %firstphiblock ]
+ br i1 undef, label %firstphiblock, label %thirdphiblock
+thirdphiblock:
+ %thirdphi = phi i64 [ %secondphi, %secondphiblock ], [ %div, %cond.true ]
+ br label %secondphiblock
+for.cond17thread-pre-split:
+ br label %cond.true
+cond.true:
+ %fourthphi = phi i64 [ %arg, %entry ], [ %firstphi, %for.cond17thread-pre-split ]
+ %div = sdiv i64 %fourthphi, 4
+ br label %thirdphiblock
+temp:
+ ret void
+}
+@b = external global i32, align 4
+@a = external global i32, align 4
+define void @fn3() {
+; CHECK-LABEL: @fn3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L1:%.*]]
+; CHECK: l1.loopexit:
+; CHECK-NEXT: br label [[L1]]
+; CHECK: l1:
+; CHECK-NEXT: [[F_0:%.*]] = phi i32* [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond.loopexit:
+; CHECK-NEXT: store i8 undef, i8* null
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.cond:
+; CHECK-NEXT: br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
+; CHECK: for.cond1:
+; CHECK-NEXT: br label [[L2:%.*]]
+; CHECK: for.body3:
+; CHECK-NEXT: br i1 undef, label [[FOR_COND1:%.*]], label [[L1_LOOPEXIT]]
+; CHECK: l2:
+; CHECK-NEXT: [[G_4:%.*]] = phi i32* [ @b, [[FOR_END14]] ], [ @a, [[FOR_COND1]] ]
+; CHECK-NEXT: [[F_2:%.*]] = phi i32* [ [[F_0]], [[FOR_END14]] ], [ @a, [[FOR_COND1]] ]
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
+; CHECK: for.inc:
+; CHECK-NEXT: br i1 false, label [[FOR_COND_LOOPEXIT:%.*]], label [[FOR_INC]]
+; CHECK: for.end14:
+; CHECK-NEXT: br label [[L2]]
+;
+entry:
+ br label %l1
+l1.loopexit:
+ %g.223.lcssa = phi i32* [ @b, %for.body3 ]
+ br label %l1
+l1:
+ %g.0 = phi i32* [ undef, %entry ], [ %g.223.lcssa, %l1.loopexit ]
+ %f.0 = phi i32* [ @b, %entry ], [ @a, %l1.loopexit ]
+ br label %for.cond
+for.cond.loopexit:
+ br label %for.cond
+for.cond:
+ %g.1 = phi i32* [ %g.0, %l1 ], [ %g.4, %for.cond.loopexit ]
+ %f.1 = phi i32* [ %f.0, %l1 ], [ %f.2, %for.cond.loopexit ]
+ br i1 undef, label %for.end14, label %for.cond1.preheader
+for.cond1.preheader:
+ br label %for.body3
+for.cond1:
+ br label %l2
+for.body3:
+ br i1 undef, label %for.cond1, label %l1.loopexit
+l2:
+ %g.4 = phi i32* [ %g.1, %for.end14 ], [ @a, %for.cond1 ]
+ %f.2 = phi i32* [ %f.1, %for.end14 ], [ @a, %for.cond1 ]
+ br label %for.inc
+for.inc:
+ br i1 false, label %for.cond.loopexit, label %for.inc
+for.end14:
+ br label %l2
+}
+
diff --git a/test/Transforms/NewGVN/pr32845.ll b/test/Transforms/NewGVN/pr32845.ll
new file mode 100644
index 000000000000..beba3363b303
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32845.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -newgvn %s -S | FileCheck %s
+
+@b = external global i32, align 4
+@a = external global i32, align 4
+define void @tinkywinky() {
+; CHECK-LABEL: @tinkywinky(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L1:%.*]]
+; CHECK: l1.loopexit:
+; CHECK-NEXT: br label [[L1]]
+; CHECK: l1:
+; CHECK-NEXT: [[F_0:%.*]] = phi i32* [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond.loopexit:
+; CHECK-NEXT: store i8 undef, i8* null
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.cond:
+; CHECK-NEXT: br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
+; CHECK: for.cond1:
+; CHECK-NEXT: br label [[L2:%.*]]
+; CHECK: for.body3:
+; CHECK-NEXT: br i1 undef, label [[FOR_COND1:%.*]], label [[L1_LOOPEXIT]]
+; CHECK: l2:
+; CHECK-NEXT: [[G_4:%.*]] = phi i32* [ @b, [[FOR_END14]] ], [ @a, [[FOR_COND1]] ]
+; CHECK-NEXT: [[F_2:%.*]] = phi i32* [ [[F_0]], [[FOR_END14]] ], [ @a, [[FOR_COND1]] ]
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
+; CHECK: for.inc:
+; CHECK-NEXT: br i1 false, label [[FOR_COND_LOOPEXIT:%.*]], label [[FOR_INC]]
+; CHECK: for.end14:
+; CHECK-NEXT: br label [[L2]]
+;
+entry:
+ br label %l1
+l1.loopexit:
+ %g.223.lcssa = phi i32* [ @b, %for.body3 ]
+ br label %l1
+l1:
+ %g.0 = phi i32* [ undef, %entry ], [ %g.223.lcssa, %l1.loopexit ]
+ %f.0 = phi i32* [ @b, %entry ], [ @a, %l1.loopexit ]
+ br label %for.cond
+for.cond.loopexit:
+ br label %for.cond
+for.cond:
+ %g.1 = phi i32* [ %g.0, %l1 ], [ %g.4, %for.cond.loopexit ]
+ %f.1 = phi i32* [ %f.0, %l1 ], [ %f.2, %for.cond.loopexit ]
+ br i1 undef, label %for.end14, label %for.cond1.preheader
+for.cond1.preheader:
+ br label %for.body3
+for.cond1:
+ br label %l2
+for.body3:
+ br i1 undef, label %for.cond1, label %l1.loopexit
+l2:
+ %g.4 = phi i32* [ %g.1, %for.end14 ], [ @a, %for.cond1 ]
+ %f.2 = phi i32* [ %f.1, %for.end14 ], [ @a, %for.cond1 ]
+ br label %for.inc
+for.inc:
+ br i1 false, label %for.cond.loopexit, label %for.inc
+for.end14:
+ br label %l2
+}
diff --git a/test/Transforms/NewGVN/pr32897.ll b/test/Transforms/NewGVN/pr32897.ll
new file mode 100644
index 000000000000..eb19aa367b72
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32897.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -newgvn %s | FileCheck %s
+
+define void @tinkywinky(i64* %b) {
+; CHECK-LABEL: @tinkywinky(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: store i64 undef, i64* [[B:%.*]]
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* [[B]]
+; CHECK-NEXT: br i1 undef, label [[BODY]], label [[END:%.*]]
+; CHECK: end:
+; CHECK-NEXT: br label [[BODY]]
+;
+entry:
+ br label %body
+body:
+ %d.1 = phi i64* [ undef, %entry ], [ %d.1, %body ], [ %b, %end ]
+ store i64 undef, i64* %d.1
+ %b2 = load i64, i64* %b
+ %or = or i64 %b2, 0
+ store i64 %or, i64* %b
+ br i1 undef, label %body, label %end
+end:
+ br label %body
+}
diff --git a/test/Transforms/NewGVN/pr32945.ll b/test/Transforms/NewGVN/pr32945.ll
new file mode 100644
index 000000000000..553ba4bd4aaa
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32945.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -newgvn %s | FileCheck %s
+; CHECK-NOT: call i32 @llvm.ssa.copy
+
+@d = external global i32
+@e = external global i32
+define void @tinkywinky() {
+ br i1 true, label %lor.lhs.false, label %cond.true
+lor.lhs.false:
+ %tmp = load i32, i32* @d, align 4
+ %patatino = load i32, i32* null, align 4
+ %or = or i32 %tmp, %patatino
+ store i32 %or, i32* @d, align 4
+ br label %cond.true
+cond.true:
+ %tmp1 = load i32, i32* @e, align 4
+ %tmp2 = load i32, i32* @d, align 4
+ %cmp = icmp eq i32 %tmp1, %tmp2
+ br i1 %cmp, label %cond.true6, label %cond.false
+cond.true6:
+ %cmp7 = icmp slt i32 %tmp1, 0
+ br i1 %cmp7, label %cond.false, label %cond.false
+cond.false:
+ ret void
+}
diff --git a/test/Transforms/NewGVN/pr33014.ll b/test/Transforms/NewGVN/pr33014.ll
new file mode 100644
index 000000000000..4157178e4f0c
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33014.ll
@@ -0,0 +1,54 @@
+; Make sure we don't end up in an infinite recursion in singleReachablePHIPath().
+; REQUIRES: asserts
+; RUN: opt -newgvn -S %s | FileCheck %s
+
+@c = external global i64, align 8
+
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK: entry:
+; CHECK-NEXT: br i1 undef, label %l2, label %if.then
+; CHECK: if.then: ; preds = %entry
+; CHECK-NEXT: br label %for.body
+; CHECK: ph: ; preds = %back, %ontrue
+; CHECK-NEXT: br label %for.body
+; CHECK: for.body: ; preds = %ph, %if.then
+; CHECK-NEXT: br i1 undef, label %ontrue, label %onfalse
+; CHECK: onfalse: ; preds = %for.body
+; CHECK-NEXT: %patatino = load i64, i64* @c
+; CHECK-NEXT: ret void
+; CHECK: ontrue: ; preds = %for.body
+; CHECK-NEXT: %dipsy = load i64, i64* @c
+; CHECK-NEXT: br label %ph
+; CHECK: back: ; preds = %l2
+; CHECK-NEXT: store i8 undef, i8* null
+; CHECK-NEXT: br label %ph
+; CHECK: end: ; preds = %l2
+; CHECK-NEXT: ret void
+; CHECK: l2: ; preds = %entry
+; CHECK-NEXT: br i1 false, label %back, label %end
+; CHECK-NEXT: }
+
+define void @tinkywinky() {
+entry:
+ br i1 undef, label %l2, label %if.then
+if.then:
+ br label %for.body
+ph:
+ br label %for.body
+for.body:
+ br i1 undef, label %ontrue, label %onfalse
+onfalse:
+ %patatino = load i64, i64* @c
+ store i64 %patatino, i64* @c
+ ret void
+ontrue:
+ %dipsy = load i64, i64* @c
+ store i64 %dipsy, i64* @c
+ br label %ph
+back:
+ br label %ph
+end:
+ ret void
+l2:
+ br i1 false, label %back, label %end
+}
diff --git a/test/Transforms/NewGVN/pr33086.ll b/test/Transforms/NewGVN/pr33086.ll
new file mode 100644
index 000000000000..6117ef35e6de
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33086.ll
@@ -0,0 +1,59 @@
+; RUN: opt -newgvn -S %s | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK: entry:
+; CHECK-NEXT: br i1 undef, label %for.cond18, label %for.cond.preheader
+; CHECK: for.cond.preheader:
+; CHECK-NEXT: br label %for.cond2thread-pre-split
+; CHECK: for.cond2thread-pre-split:
+; CHECK-NEXT: %conv24 = phi i32 [ 0, %for.cond.preheader ], [ %conv, %for.inc.split ]
+; CHECK-NEXT: br label %for.inc.split
+; CHECK: for.inc.split:
+; CHECK-NEXT: %add = shl nsw i32 %conv24, 16
+; CHECK-NEXT: %sext23 = add i32 %add, 65536
+; CHECK-NEXT: %conv = ashr exact i32 %sext23, 16
+; CHECK-NEXT: %cmp = icmp slt i32 %sext23, 3604480
+; CHECK-NEXT: br i1 %cmp, label %for.cond2thread-pre-split, label %l1.loopexit
+; CHECK: l1.loopexit:
+; CHECK-NEXT: br label %l1
+; CHECK: l1:
+; CHECK-NEXT: %0 = load i16, i16* null, align 2
+; CHECK-NEXT: %g.0.g.0..pr = load i16, i16* null, align 2
+; CHECK-NEXT: ret void
+; CHECK: for.cond18:
+; CHECK-NEXT: br label %l1
+; CHECK-NEXT: }
+
+define void @tinkywinky() {
+entry:
+ br i1 undef, label %for.cond18, label %for.cond.preheader
+
+for.cond.preheader:
+ br label %for.cond2thread-pre-split
+
+for.cond2thread-pre-split:
+ %conv24 = phi i32 [ 0, %for.cond.preheader ], [ %conv, %for.inc.split ]
+ br label %for.inc.split
+
+for.inc.split:
+ %add = shl nsw i32 %conv24, 16
+ %sext23 = add i32 %add, 65536
+ %conv = ashr exact i32 %sext23, 16
+ %cmp = icmp slt i32 %sext23, 3604480
+ br i1 %cmp, label %for.cond2thread-pre-split, label %l1.loopexit
+
+l1.loopexit:
+ br label %l1
+
+l1:
+ %h.0 = phi i16* [ undef, %for.cond18 ], [ null, %l1.loopexit ]
+ %0 = load i16, i16* %h.0, align 2
+ store i16 %0, i16* null, align 2
+ %g.0.g.0..pr = load i16, i16* null, align 2
+ %tobool15 = icmp eq i16 %g.0.g.0..pr, 0
+ ret void
+
+for.cond18:
+ br label %l1
+}
diff --git a/test/Transforms/NewGVN/pr33116.ll b/test/Transforms/NewGVN/pr33116.ll
new file mode 100644
index 000000000000..9bf6bb1ff6ef
--- /dev/null
+++ b/test/Transforms/NewGVN/pr33116.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -newgvn %s | FileCheck %s
+
+@a = external global i32
+
+define void @b() {
+; CHECK-LABEL: @b(
+; CHECK-NEXT: br i1 false, label [[C:%.*]], label [[WHILE_D:%.*]]
+; CHECK: while.d:
+; CHECK-NEXT: br label [[F:%.*]]
+; CHECK: f:
+; CHECK-NEXT: br i1 undef, label [[IF_E:%.*]], label [[C]]
+; CHECK: c:
+; CHECK-NEXT: br i1 undef, label [[IF_G:%.*]], label [[IF_E]]
+; CHECK: if.g:
+; CHECK-NEXT: store i32 undef, i32* @a
+; CHECK-NEXT: br label [[WHILE_D]]
+; CHECK: if.e:
+; CHECK-NEXT: br label [[F]]
+;
+ br i1 false, label %c, label %while.d
+
+while.d: ; preds = %if.g, %0
+ br label %f
+
+f: ; preds = %if.e, %while.d
+ br i1 undef, label %if.e, label %c
+
+c: ; preds = %f, %0
+ br i1 undef, label %if.g, label %if.e
+
+if.g: ; preds = %c
+ store i32 undef, i32* @a
+ br label %while.d
+
+if.e: ; preds = %c, %f
+ br label %f
+}
+
diff --git a/test/Transforms/NewGVN/storeoverstore.ll b/test/Transforms/NewGVN/storeoverstore.ll
index 49b55d430dc7..28f5eea03ced 100644
--- a/test/Transforms/NewGVN/storeoverstore.ll
+++ b/test/Transforms/NewGVN/storeoverstore.ll
@@ -13,11 +13,11 @@ define i32 @foo(i32*, i32) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP1:%.*]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
; CHECK: br label [[TMP5]]
-; CHECK: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP2:%.*]] ]
-; CHECK-NEXT: br i1 [[TMP3]], label [[TMP6:%.*]], label [[TMP8:%.*]]
-; CHECK: [[TMP7:%.*]] = add nsw i32 [[DOT0]], 5
-; CHECK-NEXT: br label [[TMP8]]
-; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP7]], [[TMP6]] ], [ [[DOT0]], [[TMP5]] ]
+; CHECK: [[TMP6:%.*]] = phi i32 [ 15, [[TMP4]] ], [ 10, [[TMP2:%.*]] ]
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP2]] ]
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP8:%.*]]
+; CHECK: br label [[TMP8]]
+; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP6]], [[TMP7]] ], [ [[DOT0]], [[TMP5]] ]
; CHECK-NEXT: ret i32 [[DOT1]]
;
store i32 5, i32* %0, align 4
@@ -54,11 +54,11 @@ define i32 @foo2(i32*, i32) {
; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
; CHECK: br label [[TMP6:%.*]]
; CHECK: br label [[TMP6]]
-; CHECK: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP5]] ]
-; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP9:%.*]]
-; CHECK: [[TMP8:%.*]] = add nsw i32 [[DOT0]], 5
-; CHECK-NEXT: br label [[TMP9]]
-; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ [[DOT0]], [[TMP6]] ]
+; CHECK: [[TMP7:%.*]] = phi i32 [ 15, [[TMP4]] ], [ 10, [[TMP5]] ]
+; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP5]] ]
+; CHECK-NEXT: br i1 [[TMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]]
+; CHECK: br label [[TMP9]]
+; CHECK: [[DOT1:%.*]] = phi i32 [ [[TMP7]], [[TMP8]] ], [ [[DOT0]], [[TMP6]] ]
; CHECK-NEXT: ret i32 [[DOT1]]
;
store i32 5, i32* %0, align 4
diff --git a/test/Transforms/SafeStack/X86/coloring-ssp.ll b/test/Transforms/SafeStack/X86/coloring-ssp.ll
index 3b04fdf13fbc..040632e7526d 100644
--- a/test/Transforms/SafeStack/X86/coloring-ssp.ll
+++ b/test/Transforms/SafeStack/X86/coloring-ssp.ll
@@ -1,4 +1,4 @@
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
; %x and %y share a stack slot between them, but not with the stack guard.
define void @f() safestack sspreq {
diff --git a/test/Transforms/SafeStack/X86/coloring.ll b/test/Transforms/SafeStack/X86/coloring.ll
index 76bdf37dbf4e..60e960e693d5 100644
--- a/test/Transforms/SafeStack/X86/coloring.ll
+++ b/test/Transforms/SafeStack/X86/coloring.ll
@@ -1,5 +1,5 @@
-; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
define void @f() safestack {
entry:
diff --git a/test/Transforms/SafeStack/X86/coloring2.ll b/test/Transforms/SafeStack/X86/coloring2.ll
index 2a8f871945ff..ef00d9b54715 100644
--- a/test/Transforms/SafeStack/X86/coloring2.ll
+++ b/test/Transforms/SafeStack/X86/coloring2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
; x and y share the stack slot.
define void @f() safestack {
diff --git a/test/Transforms/SafeStack/X86/layout-frag.ll b/test/Transforms/SafeStack/X86/layout-frag.ll
index b127defc2c5d..b9831c26b74c 100644
--- a/test/Transforms/SafeStack/X86/layout-frag.ll
+++ b/test/Transforms/SafeStack/X86/layout-frag.ll
@@ -1,5 +1,5 @@
; Test that safestack layout reuses a region w/o fragmentation.
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -safe-stack-coloring=1 -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
define void @f() safestack {
; CHECK-LABEL: define void @f
diff --git a/test/tools/llvm-cvtres/Inputs/cursor_small.bmp b/test/tools/llvm-cvtres/Inputs/cursor_small.bmp
new file mode 100644
index 000000000000..ce513261bc2c
--- /dev/null
+++ b/test/tools/llvm-cvtres/Inputs/cursor_small.bmp
Binary files differ
diff --git a/test/tools/llvm-cvtres/Inputs/okay_small.bmp b/test/tools/llvm-cvtres/Inputs/okay_small.bmp
new file mode 100644
index 000000000000..e4005bf5ef97
--- /dev/null
+++ b/test/tools/llvm-cvtres/Inputs/okay_small.bmp
Binary files differ
diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.rc b/test/tools/llvm-cvtres/Inputs/test_resource.rc
new file mode 100644
index 000000000000..fd616520dbe1
--- /dev/null
+++ b/test/tools/llvm-cvtres/Inputs/test_resource.rc
@@ -0,0 +1,44 @@
+#include "windows.h"
+
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+
+myaccelerators ACCELERATORS
+{
+ "^C", 999, VIRTKEY, ALT
+ "D", 1100, VIRTKEY, CONTROL, SHIFT
+ "^R", 444, ASCII, NOINVERT
+}
+
+cursor BITMAP "cursor_small.bmp"
+okay BITMAP "okay_small.bmp"
+
+14432 MENU
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+{
+ MENUITEM "yu", 100
+ MENUITEM "shala", 101
+ MENUITEM "kaoya", 102
+}
+
+testdialog DIALOG 10, 10, 200, 300
+STYLE WS_POPUP | WS_BORDER
+CAPTION "Test"
+{
+ CTEXT "Continue:", 1, 10, 10, 230, 14
+ PUSHBUTTON "&OK", 2, 66, 134, 161, 13
+}
+
+12 ACCELERATORS
+{
+ "X", 164, VIRTKEY, ALT
+ "H", 5678, VIRTKEY, CONTROL, SHIFT
+ "^R", 444, ASCII, NOINVERT
+}
+
+"eat" MENU
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS
+{
+ MENUITEM "fish", 100
+ MENUITEM "salad", 101
+ MENUITEM "duck", 102
+}
diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.res b/test/tools/llvm-cvtres/Inputs/test_resource.res
new file mode 100644
index 000000000000..c577ecc3d633
--- /dev/null
+++ b/test/tools/llvm-cvtres/Inputs/test_resource.res
Binary files differ
diff --git a/test/tools/llvm-cvtres/resource.test b/test/tools/llvm-cvtres/resource.test
new file mode 100644
index 000000000000..16970343c60d
--- /dev/null
+++ b/test/tools/llvm-cvtres/resource.test
@@ -0,0 +1,7 @@
+// The input was generated with the following command, using the original Windows
+// rc.exe:
+// > rc /fo test_resource.res /nologo test_resource.rc
+
+RUN: llvm-cvtres %p/Inputs/test_resource.res | FileCheck %s
+
+CHECK: Number of resources: 7
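For context (derived from the printing code added to llvm-cvtres.cpp later in this patch, not an additional FileCheck requirement), a run without a machine argument would print roughly:

  Machine architecture not specified; assumed X64.
  Number of resources: 7
  Machine: X64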
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 8c786950036f..589005943045 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -356,9 +356,7 @@ static bool addPass(PassManagerBase &PM, const char *argv0,
}
Pass *P;
- if (PI->getTargetMachineCtor())
- P = PI->getTargetMachineCtor()(&TPC.getTM<TargetMachine>());
- else if (PI->getNormalCtor())
+ if (PI->getNormalCtor())
P = PI->getNormalCtor()();
else {
errs() << argv0 << ": cannot create pass: " << PI->getPassName() << "\n";
diff --git a/tools/llvm-cvtres/CMakeLists.txt b/tools/llvm-cvtres/CMakeLists.txt
index 52edccac8165..e912030e205e 100644
--- a/tools/llvm-cvtres/CMakeLists.txt
+++ b/tools/llvm-cvtres/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
+ Object
Option
Support
)
diff --git a/tools/llvm-cvtres/llvm-cvtres.cpp b/tools/llvm-cvtres/llvm-cvtres.cpp
index f03e0b772e1b..96f7437ab5f6 100644
--- a/tools/llvm-cvtres/llvm-cvtres.cpp
+++ b/tools/llvm-cvtres/llvm-cvtres.cpp
@@ -14,17 +14,23 @@
#include "llvm-cvtres.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/WindowsResource.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
+#include "llvm/Support/BinaryStreamError.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace object;
namespace {
@@ -61,6 +67,28 @@ public:
static ExitOnError ExitOnErr;
}
+LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) {
+ errs() << Msg;
+ exit(1);
+}
+
+static void reportError(StringRef Input, std::error_code EC) {
+ reportError(Twine(Input) + ": " + EC.message() + ".\n");
+}
+
+void error(std::error_code EC) {
+ if (!EC)
+ return;
+ reportError(EC.message() + ".\n");
+}
+
+void error(Error EC) {
+ if (!EC)
+ return;
+ handleAllErrors(std::move(EC),
+ [&](const ErrorInfoBase &EI) { reportError(EI.message()); });
+}
+
int main(int argc_, const char *argv_[]) {
sys::PrintStackTraceOnErrorSignal(argv_[0]);
PrettyStackTraceProgram X(argc_, argv_);
@@ -76,11 +104,79 @@ int main(int argc_, const char *argv_[]) {
CvtResOptTable T;
unsigned MAI, MAC;
- ArrayRef<const char *> ArgsArr = makeArrayRef(argv_, argc_);
+ ArrayRef<const char *> ArgsArr = makeArrayRef(argv_ + 1, argc_);
opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);
- if (InputArgs.hasArg(OPT_HELP))
+ if (InputArgs.hasArg(OPT_HELP)) {
T.PrintHelp(outs(), "cvtres", "Resource Converter", false);
-
+ return 0;
+ }
+
+ machine Machine;
+
+ if (InputArgs.hasArg(OPT_MACHINE)) {
+ std::string MachineString = InputArgs.getLastArgValue(OPT_MACHINE).upper();
+ Machine = StringSwitch<machine>(MachineString)
+ .Case("ARM", machine::ARM)
+ .Case("X64", machine::X64)
+ .Case("X86", machine::X86)
+ .Default(machine::UNKNOWN);
+ if (Machine == machine::UNKNOWN)
+ reportError("Unsupported machine architecture");
+ } else {
+ outs() << "Machine architecture not specified; assumed X64.\n";
+ Machine = machine::X64;
+ }
+
+ std::vector<std::string> InputFiles = InputArgs.getAllArgValues(OPT_INPUT);
+
+ if (InputFiles.size() == 0) {
+ reportError("No input file specified");
+ }
+
+ SmallString<128> OutputFile;
+
+ if (InputArgs.hasArg(OPT_OUT)) {
+ OutputFile = InputArgs.getLastArgValue(OPT_OUT);
+ } else {
+ OutputFile = StringRef(InputFiles[0]);
+ llvm::sys::path::replace_extension(OutputFile, ".obj");
+ }
+
+ for (const auto &File : InputFiles) {
+ Expected<object::OwningBinary<object::Binary>> BinaryOrErr =
+ object::createBinary(File);
+ if (!BinaryOrErr)
+ reportError(File, errorToErrorCode(BinaryOrErr.takeError()));
+
+ Binary &Binary = *BinaryOrErr.get().getBinary();
+
+ WindowsResource *RF = dyn_cast<WindowsResource>(&Binary);
+ if (!RF)
+ reportError(File + ": unrecognized file format.\n");
+
+ int EntryNumber = 0;
+ Expected<ResourceEntryRef> EntryOrErr = RF->getHeadEntry();
+ if (!EntryOrErr)
+ error(EntryOrErr.takeError());
+ ResourceEntryRef Entry = EntryOrErr.get();
+ bool End = false;
+ while (!End) {
+ error(Entry.moveNext(End));
+ EntryNumber++;
+ }
+ outs() << "Number of resources: " << EntryNumber << "\n";
+ }
+ outs() << "Machine: ";
+ switch (Machine) {
+ case machine::ARM:
+ outs() << "ARM\n";
+ break;
+ case machine::X86:
+ outs() << "X86\n";
+ break;
+ default:
+ outs() << "X64\n";
+ }
return 0;
}
diff --git a/tools/llvm-cvtres/llvm-cvtres.h b/tools/llvm-cvtres/llvm-cvtres.h
index eeaba1969036..2e45b66461f0 100644
--- a/tools/llvm-cvtres/llvm-cvtres.h
+++ b/tools/llvm-cvtres/llvm-cvtres.h
@@ -10,4 +10,10 @@
#ifndef LLVM_TOOLS_LLVMCVTRES_LLVMCVTRES_H
#define LLVM_TOOLS_LLVMCVTRES_LLVMCVTRES_H
+#include <system_error>
+
+void error(std::error_code EC);
+
+enum class machine { UNKNOWN = 0, ARM, X64, X86 };
+
#endif
diff --git a/tools/llvm-dwp/llvm-dwp.cpp b/tools/llvm-dwp/llvm-dwp.cpp
index 0c4af7576608..f8d00b3b5534 100644
--- a/tools/llvm-dwp/llvm-dwp.cpp
+++ b/tools/llvm-dwp/llvm-dwp.cpp
@@ -373,7 +373,7 @@ handleCompressedSection(std::deque<SmallString<32>> &UncompressedSections,
return createError(Name, Dec.takeError());
UncompressedSections.emplace_back();
- if (Error E = Dec->decompress(UncompressedSections.back()))
+ if (Error E = Dec->resizeAndDecompress(UncompressedSections.back()))
return createError(Name, std::move(E));
Name = Name.substr(2); // Drop ".z"
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 2458d3d123ca..ccc673be4570 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -669,24 +669,30 @@ private:
if (!ThinLTOIndex.empty())
errs() << "Warning: -thinlto-index ignored for codegen stage";
+ std::vector<std::unique_ptr<MemoryBuffer>> InputBuffers;
for (auto &Filename : InputFilenames) {
LLVMContext Ctx;
- auto TheModule = loadModule(Filename, Ctx);
-
- auto Buffer = ThinGenerator.codegen(*TheModule);
+ auto InputOrErr = MemoryBuffer::getFile(Filename);
+ error(InputOrErr, "error " + CurrentActivity);
+ InputBuffers.push_back(std::move(*InputOrErr));
+ ThinGenerator.addModule(Filename, InputBuffers.back()->getBuffer());
+ }
+ ThinGenerator.setCodeGenOnly(true);
+ ThinGenerator.run();
+ for (auto BinName :
+ zip(ThinGenerator.getProducedBinaries(), InputFilenames)) {
std::string OutputName = OutputFilename;
- if (OutputName.empty()) {
- OutputName = Filename + ".thinlto.o";
- }
- if (OutputName == "-") {
- outs() << Buffer->getBuffer();
+ if (OutputName.empty())
+ OutputName = std::get<1>(BinName) + ".thinlto.o";
+ else if (OutputName == "-") {
+ outs() << std::get<0>(BinName)->getBuffer();
return;
}
std::error_code EC;
raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::F_None);
error(EC, "error opening the file '" + OutputName + "'");
- OS << Buffer->getBuffer();
+ OS << std::get<0>(BinName)->getBuffer();
}
}
diff --git a/tools/llvm-pdbdump/Analyze.cpp b/tools/llvm-pdbdump/Analyze.cpp
index ab4477ed7bad..3a026e5d2451 100644
--- a/tools/llvm-pdbdump/Analyze.cpp
+++ b/tools/llvm-pdbdump/Analyze.cpp
@@ -14,7 +14,6 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
diff --git a/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp b/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp
index 5ad0bfad26c1..3b609ae50c1c 100644
--- a/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp
+++ b/tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp
@@ -29,15 +29,15 @@ static StringRef getLeafName(TypeLeafKind K) {
return StringRef();
}
-CompactTypeDumpVisitor::CompactTypeDumpVisitor(TypeDatabase &TypeDB,
+CompactTypeDumpVisitor::CompactTypeDumpVisitor(TypeCollection &Types,
ScopedPrinter *W)
- : CompactTypeDumpVisitor(TypeDB, TypeIndex(TypeIndex::FirstNonSimpleIndex),
+ : CompactTypeDumpVisitor(Types, TypeIndex(TypeIndex::FirstNonSimpleIndex),
W) {}
-CompactTypeDumpVisitor::CompactTypeDumpVisitor(TypeDatabase &TypeDB,
+CompactTypeDumpVisitor::CompactTypeDumpVisitor(TypeCollection &Types,
TypeIndex FirstTI,
ScopedPrinter *W)
- : W(W), TI(FirstTI), Offset(0), TypeDB(TypeDB) {}
+ : W(W), TI(FirstTI), Offset(0), Types(Types) {}
Error CompactTypeDumpVisitor::visitTypeBegin(CVType &Record) {
return Error::success();
@@ -46,7 +46,7 @@ Error CompactTypeDumpVisitor::visitTypeBegin(CVType &Record) {
Error CompactTypeDumpVisitor::visitTypeEnd(CVType &Record) {
uint32_t I = TI.getIndex();
StringRef Leaf = getLeafName(Record.Type);
- StringRef Name = TypeDB.getTypeName(TI);
+ StringRef Name = Types.getTypeName(TI);
W->printString(
llvm::formatv("Index: {0:x} ({1:N} bytes, offset {2:N}) {3} \"{4}\"", I,
Record.length(), Offset, Leaf, Name)
diff --git a/tools/llvm-pdbdump/CompactTypeDumpVisitor.h b/tools/llvm-pdbdump/CompactTypeDumpVisitor.h
index 76fafc93e030..41ccea0c2e90 100644
--- a/tools/llvm-pdbdump/CompactTypeDumpVisitor.h
+++ b/tools/llvm-pdbdump/CompactTypeDumpVisitor.h
@@ -17,7 +17,7 @@
namespace llvm {
class ScopedPrinter;
namespace codeview {
-class TypeDatabase;
+class TypeCollection;
}
namespace pdb {
@@ -26,8 +26,8 @@ namespace pdb {
/// Dumps records on a single line, and ignores member records.
class CompactTypeDumpVisitor : public codeview::TypeVisitorCallbacks {
public:
- CompactTypeDumpVisitor(codeview::TypeDatabase &TypeDB, ScopedPrinter *W);
- CompactTypeDumpVisitor(codeview::TypeDatabase &TypeDB,
+ CompactTypeDumpVisitor(codeview::TypeCollection &Types, ScopedPrinter *W);
+ CompactTypeDumpVisitor(codeview::TypeCollection &Types,
codeview::TypeIndex FirstTI, ScopedPrinter *W);
/// Paired begin/end actions for all types. Receives all record data,
@@ -40,7 +40,7 @@ private:
codeview::TypeIndex TI;
uint32_t Offset;
- codeview::TypeDatabase &TypeDB;
+ codeview::TypeCollection &Types;
};
} // end namespace pdb
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index c4fecb80ea5a..07d3b226d7c6 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -14,9 +14,9 @@
#include "StreamUtil.h"
#include "llvm-pdbdump.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/EnumTables.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h"
@@ -25,7 +25,6 @@
#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h"
#include "llvm/DebugInfo/CodeView/SymbolDumper.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
@@ -84,7 +83,7 @@ struct PageStats {
class C13RawVisitor : public C13DebugFragmentVisitor {
public:
- C13RawVisitor(ScopedPrinter &P, PDBFile &F, TypeDatabase &IPI)
+ C13RawVisitor(ScopedPrinter &P, PDBFile &F, LazyRandomTypeCollection &IPI)
: C13DebugFragmentVisitor(F), P(P), IPI(IPI) {}
Error handleLines() override {
@@ -160,7 +159,7 @@ public:
if (auto EC = printFileName("FileName", L.Header->FileID))
return EC;
- if (auto EC = dumpTypeRecord("Function", IPI, L.Header->Inlinee))
+ if (auto EC = dumpTypeRecord("Function", L.Header->Inlinee))
return EC;
P.printNumber("SourceLine", L.Header->SourceLineNum);
if (IL.hasExtraFiles()) {
@@ -176,11 +175,11 @@ public:
}
private:
- Error dumpTypeRecord(StringRef Label, TypeDatabase &DB, TypeIndex Index) {
- CompactTypeDumpVisitor CTDV(DB, Index, &P);
+ Error dumpTypeRecord(StringRef Label, TypeIndex Index) {
+ CompactTypeDumpVisitor CTDV(IPI, Index, &P);
DictScope D(P, Label);
- if (DB.contains(Index)) {
- CVType &Type = DB.getTypeRecord(Index);
+ if (IPI.contains(Index)) {
+ CVType Type = IPI.getType(Index);
if (auto EC = codeview::visitTypeRecord(Type, CTDV))
return EC;
} else {
@@ -199,7 +198,7 @@ private:
}
ScopedPrinter &P;
- TypeDatabase &IPI;
+ LazyRandomTypeCollection &IPI;
};
}
@@ -609,14 +608,19 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
VerLabel = "IPI Version";
}
- if (!DumpRecordBytes && !DumpRecords && !DumpTpiHash)
- return Error::success();
-
auto Tpi = (StreamIdx == StreamTPI) ? File.getPDBTpiStream()
: File.getPDBIpiStream();
if (!Tpi)
return Tpi.takeError();
+ auto ExpectedTypes = initializeTypeDatabase(StreamIdx);
+ if (!ExpectedTypes)
+ return ExpectedTypes.takeError();
+ auto &Types = *ExpectedTypes;
+
+ if (!DumpRecordBytes && !DumpRecords && !DumpTpiHash)
+ return Error::success();
+
std::unique_ptr<DictScope> StreamScope;
std::unique_ptr<ListScope> RecordScope;
@@ -624,25 +628,19 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
P.printNumber(VerLabel, Tpi->getTpiVersion());
P.printNumber("Record count", Tpi->getNumTypeRecords());
- Optional<TypeDatabase> &StreamDB = (StreamIdx == StreamTPI) ? TypeDB : ItemDB;
-
std::vector<std::unique_ptr<TypeVisitorCallbacks>> Visitors;
- if (!StreamDB.hasValue()) {
- StreamDB.emplace(Tpi->getNumTypeRecords());
- Visitors.push_back(make_unique<TypeDatabaseVisitor>(*StreamDB));
- }
// If we're in dump mode, add a dumper with the appropriate detail level.
if (DumpRecords) {
std::unique_ptr<TypeVisitorCallbacks> Dumper;
if (opts::raw::CompactRecords)
- Dumper = make_unique<CompactTypeDumpVisitor>(*StreamDB, &P);
+ Dumper = make_unique<CompactTypeDumpVisitor>(Types, &P);
else {
- assert(TypeDB.hasValue());
+ assert(TpiTypes);
- auto X = make_unique<TypeDumpVisitor>(*TypeDB, &P, false);
+ auto X = make_unique<TypeDumpVisitor>(*TpiTypes, &P, false);
if (StreamIdx == StreamIPI)
- X->setItemDB(*ItemDB);
+ X->setIpiTypes(*IpiTypes);
Dumper = std::move(X);
}
Visitors.push_back(std::move(Dumper));
@@ -660,23 +658,18 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
if (DumpRecords || DumpRecordBytes)
RecordScope = llvm::make_unique<ListScope>(P, "Records");
- bool HadError = false;
-
- TypeIndex T(TypeIndex::FirstNonSimpleIndex);
- for (auto Type : Tpi->types(&HadError)) {
+ Optional<TypeIndex> I = Types.getFirst();
+ while (I) {
std::unique_ptr<DictScope> OneRecordScope;
if ((DumpRecords || DumpRecordBytes) && !opts::raw::CompactRecords)
OneRecordScope = llvm::make_unique<DictScope>(P, "");
- if (auto EC = codeview::visitTypeRecord(Type, Pipeline))
+ auto T = Types.getType(*I);
+ if (auto EC = codeview::visitTypeRecord(T, *I, Pipeline))
return EC;
-
- ++T;
+ I = Types.getNext(*I);
}
- if (HadError)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "TPI stream contained corrupt record");
if (DumpTpiHash) {
DictScope DD(P, "Hash");
@@ -711,35 +704,26 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
return Error::success();
}
-Error LLVMOutputStyle::buildTypeDatabase(uint32_t SN) {
- assert(SN == StreamIPI || SN == StreamTPI);
-
- auto &DB = (SN == StreamIPI) ? ItemDB : TypeDB;
-
- if (DB.hasValue())
- return Error::success();
-
+Expected<codeview::LazyRandomTypeCollection &>
+LLVMOutputStyle::initializeTypeDatabase(uint32_t SN) {
+ auto &TypeCollection = (SN == StreamTPI) ? TpiTypes : IpiTypes;
auto Tpi =
(SN == StreamTPI) ? File.getPDBTpiStream() : File.getPDBIpiStream();
-
if (!Tpi)
return Tpi.takeError();
- DB.emplace(Tpi->getNumTypeRecords());
-
- TypeDatabaseVisitor DBV(*DB);
-
- auto HashValues = Tpi->getHashValues();
- if (HashValues.empty())
- return codeview::visitTypeStream(Tpi->typeArray(), DBV);
-
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(DBV);
-
- TpiHashVerifier HashVerifier(HashValues, Tpi->getNumHashBuckets());
- Pipeline.addCallbackToPipeline(HashVerifier);
+ if (!TypeCollection) {
+ // Initialize the type collection even if we're not going to dump it, so
+ // that if some other part of the dumper later wants to use some or all of
+ // the records, it can still access them lazily.
+ auto &Types = Tpi->typeArray();
+ uint32_t Count = Tpi->getNumTypeRecords();
+ auto Offsets = Tpi->getTypeIndexOffsets();
+ TypeCollection =
+ llvm::make_unique<LazyRandomTypeCollection>(Types, Count, Offsets);
+ }
- return codeview::visitTypeStream(Tpi->typeArray(), Pipeline);
+ return *TypeCollection;
}
Error LLVMOutputStyle::dumpDbiStream() {
@@ -814,11 +798,13 @@ Error LLVMOutputStyle::dumpDbiStream() {
return EC;
if (ShouldDumpSymbols) {
- if (auto EC = buildTypeDatabase(StreamTPI))
- return EC;
+ auto ExpectedTypes = initializeTypeDatabase(StreamTPI);
+ if (!ExpectedTypes)
+ return ExpectedTypes.takeError();
+ auto &Types = *ExpectedTypes;
ListScope SS(P, "Symbols");
- codeview::CVSymbolDumper SD(P, *TypeDB, nullptr, false);
+ codeview::CVSymbolDumper SD(P, Types, nullptr, false);
bool HadError = false;
for (auto S : ModS.symbols(&HadError)) {
DictScope LL(P, "");
@@ -839,10 +825,11 @@ Error LLVMOutputStyle::dumpDbiStream() {
}
if (opts::raw::DumpLineInfo) {
ListScope SS(P, "LineInfo");
- if (auto EC = buildTypeDatabase(StreamIPI))
- return EC;
-
- C13RawVisitor V(P, File, *ItemDB);
+ auto ExpectedTypes = initializeTypeDatabase(StreamIPI);
+ if (!ExpectedTypes)
+ return ExpectedTypes.takeError();
+ auto &IpiItems = *ExpectedTypes;
+ C13RawVisitor V(P, File, IpiItems);
if (auto EC = codeview::visitModuleDebugFragments(
ModS.linesAndChecksums(), V))
return EC;
@@ -960,10 +947,12 @@ Error LLVMOutputStyle::dumpPublicsStream() {
P.printList("Section Offsets", Publics->getSectionOffsets(),
printSectionOffset);
ListScope L(P, "Symbols");
- if (auto EC = buildTypeDatabase(StreamTPI))
- return EC;
+ auto ExpectedTypes = initializeTypeDatabase(StreamTPI);
+ if (!ExpectedTypes)
+ return ExpectedTypes.takeError();
+ auto &Tpi = *ExpectedTypes;
- codeview::CVSymbolDumper SD(P, *TypeDB, nullptr, false);
+ codeview::CVSymbolDumper SD(P, Tpi, nullptr, false);
bool HadError = false;
for (auto S : Publics->getSymbols(&HadError)) {
DictScope DD(P, "");
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.h b/tools/llvm-pdbdump/LLVMOutputStyle.h
index b0e7e3406b36..184dc4e1f44d 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.h
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.h
@@ -21,6 +21,11 @@
namespace llvm {
class BitVector;
+
+namespace codeview {
+class LazyRandomTypeCollection;
+}
+
namespace pdb {
class LLVMOutputStyle : public OutputStyle {
public:
@@ -29,7 +34,8 @@ public:
Error dump() override;
private:
- Error buildTypeDatabase(uint32_t SN);
+ Expected<codeview::LazyRandomTypeCollection &>
+ initializeTypeDatabase(uint32_t SN);
Error dumpFileHeaders();
Error dumpStreamSummary();
@@ -54,8 +60,8 @@ private:
PDBFile &File;
ScopedPrinter P;
- Optional<codeview::TypeDatabase> TypeDB;
- Optional<codeview::TypeDatabase> ItemDB;
+ std::unique_ptr<codeview::LazyRandomTypeCollection> TpiTypes;
+ std::unique_ptr<codeview::LazyRandomTypeCollection> IpiTypes;
SmallVector<std::string, 32> StreamPurposes;
};
}
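
The hunks above replace the eagerly built TypeDatabase with a LazyRandomTypeCollection that materializes records on demand. As a rough sketch of the iteration pattern now used in dumpTpiStream() — assuming Types, Count and Offsets come from a TPI stream exactly as shown above, and Callbacks is any TypeVisitorCallbacks instance — the loop looks like this (illustrative fragment, not part of the patch):

    // Build the collection from the raw type array, the record count, and the
    // partial index offsets, then walk it in index order.
    codeview::LazyRandomTypeCollection Collection(Types, Count, Offsets);
    Optional<TypeIndex> I = Collection.getFirst();
    while (I) {
      CVType Record = Collection.getType(*I);          // deserialized lazily
      if (auto EC = codeview::visitTypeRecord(Record, *I, Callbacks))
        return EC;                                      // propagate the Error
      I = Collection.getNext(*I);
    }

Records that are never requested are never deserialized, which is what lets dumpTpiStream() return early when no dump options are set while still leaving the collection available to later consumers.
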
diff --git a/tools/llvm-pdbdump/PdbYaml.cpp b/tools/llvm-pdbdump/PdbYaml.cpp
index 6527bec31a77..dd32eca14c1d 100644
--- a/tools/llvm-pdbdump/PdbYaml.cpp
+++ b/tools/llvm-pdbdump/PdbYaml.cpp
@@ -19,7 +19,6 @@
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
@@ -371,7 +370,6 @@ void MappingContextTraits<PdbInlineeInfo, SerializationContext>::mapping(
void MappingContextTraits<PdbTpiRecord, pdb::yaml::SerializationContext>::
mapping(IO &IO, pdb::yaml::PdbTpiRecord &Obj,
pdb::yaml::SerializationContext &Context) {
-
if (IO.outputting()) {
// For PDB to Yaml, deserialize into a high level record type, then dump it.
consumeError(codeview::visitTypeRecord(Obj.Record, Context.Dumper));
diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.h b/tools/llvm-pdbdump/YAMLOutputStyle.h
index 517c7d86d7ab..068312aec450 100644
--- a/tools/llvm-pdbdump/YAMLOutputStyle.h
+++ b/tools/llvm-pdbdump/YAMLOutputStyle.h
@@ -13,7 +13,6 @@
#include "OutputStyle.h"
#include "PdbYaml.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/YAMLTraits.h"
diff --git a/tools/llvm-pdbdump/YamlTypeDumper.cpp b/tools/llvm-pdbdump/YamlTypeDumper.cpp
index 3e447ca60b61..beb700720954 100644
--- a/tools/llvm-pdbdump/YamlTypeDumper.cpp
+++ b/tools/llvm-pdbdump/YamlTypeDumper.cpp
@@ -13,7 +13,6 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/EnumTables.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
index 0e5913fa3c93..1767c3cfda85 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -31,9 +31,12 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
+#include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/PDB/GenericError.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
@@ -67,6 +70,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Regex.h"
@@ -99,6 +103,9 @@ cl::SubCommand
AnalyzeSubcommand("analyze",
"Analyze various aspects of a PDB's structure");
+cl::SubCommand MergeSubcommand("merge",
+ "Merge multiple PDBs into a single PDB");
+
cl::OptionCategory TypeCategory("Symbol Type Options");
cl::OptionCategory FilterCategory("Filtering and Sorting Options");
cl::OptionCategory OtherOptions("Other Options");
@@ -365,9 +372,9 @@ cl::opt<std::string>
YamlPdbOutputFile("pdb", cl::desc("the name of the PDB file to write"),
cl::sub(YamlToPdbSubcommand));
-cl::list<std::string> InputFilename(cl::Positional,
- cl::desc("<input YAML file>"), cl::Required,
- cl::sub(YamlToPdbSubcommand));
+cl::opt<std::string> InputFilename(cl::Positional,
+ cl::desc("<input YAML file>"), cl::Required,
+ cl::sub(YamlToPdbSubcommand));
}
namespace pdb2yaml {
@@ -440,6 +447,15 @@ cl::list<std::string> InputFilename(cl::Positional,
cl::desc("<input PDB file>"), cl::Required,
cl::sub(AnalyzeSubcommand));
}
+
+namespace merge {
+cl::list<std::string> InputFilenames(cl::Positional,
+ cl::desc("<input PDB files>"),
+ cl::OneOrMore, cl::sub(MergeSubcommand));
+cl::opt<std::string>
+ PdbOutputFile("pdb", cl::desc("the name of the PDB file to write"),
+ cl::sub(MergeSubcommand));
+}
}
static ExitOnError ExitOnErr;
@@ -827,6 +843,57 @@ static void dumpPretty(StringRef Path) {
outs().flush();
}
+static void mergePdbs() {
+ BumpPtrAllocator Allocator;
+ TypeTableBuilder MergedTpi(Allocator);
+ TypeTableBuilder MergedIpi(Allocator);
+
+ // Create a Tpi and Ipi type table with all types from all input files.
+ for (const auto &Path : opts::merge::InputFilenames) {
+ std::unique_ptr<IPDBSession> Session;
+ auto &File = loadPDB(Path, Session);
+ SmallVector<TypeIndex, 128> SourceToDest;
+ if (File.hasPDBTpiStream()) {
+ SourceToDest.clear();
+ auto &Tpi = ExitOnErr(File.getPDBTpiStream());
+ ExitOnErr(codeview::mergeTypeStreams(MergedIpi, MergedTpi, SourceToDest,
+ nullptr, Tpi.typeArray()));
+ }
+ if (File.hasPDBIpiStream()) {
+ SourceToDest.clear();
+ auto &Ipi = ExitOnErr(File.getPDBIpiStream());
+ ExitOnErr(codeview::mergeTypeStreams(MergedIpi, MergedTpi, SourceToDest,
+ nullptr, Ipi.typeArray()));
+ }
+ }
+
+ // Then write the PDB.
+ PDBFileBuilder Builder(Allocator);
+ ExitOnErr(Builder.initialize(4096));
+ // Add each of the reserved streams. We might not put any data in them,
+ // but they at least have to be present.
+ for (uint32_t I = 0; I < kSpecialStreamCount; ++I)
+ ExitOnErr(Builder.getMsfBuilder().addStream(0));
+
+ auto &DestTpi = Builder.getTpiBuilder();
+ auto &DestIpi = Builder.getIpiBuilder();
+ MergedTpi.ForEachRecord(
+ [&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
+ DestTpi.addTypeRecord(Data, None);
+ });
+ MergedIpi.ForEachRecord(
+ [&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
+ DestIpi.addTypeRecord(Data, None);
+ });
+
+ SmallString<64> OutFile(opts::merge::PdbOutputFile);
+ if (OutFile.empty()) {
+ OutFile = opts::merge::InputFilenames[0];
+ llvm::sys::path::replace_extension(OutFile, "merged.pdb");
+ }
+ ExitOnErr(Builder.commit(OutFile));
+}
+
int main(int argc_, const char *argv_[]) {
// Print a stack trace if we signal out.
sys::PrintStackTraceOnErrorSignal(argv_[0]);
@@ -894,7 +961,12 @@ int main(int argc_, const char *argv_[]) {
if (opts::PdbToYamlSubcommand) {
pdb2Yaml(opts::pdb2yaml::InputFilename.front());
} else if (opts::YamlToPdbSubcommand) {
- yamlToPdb(opts::yaml2pdb::InputFilename.front());
+ if (opts::yaml2pdb::YamlPdbOutputFile.empty()) {
+ SmallString<16> OutputFilename(opts::yaml2pdb::InputFilename.getValue());
+ sys::path::replace_extension(OutputFilename, ".pdb");
+ opts::yaml2pdb::YamlPdbOutputFile = OutputFilename.str();
+ }
+ yamlToPdb(opts::yaml2pdb::InputFilename);
} else if (opts::AnalyzeSubcommand) {
dumpAnalysis(opts::analyze::InputFilename.front());
} else if (opts::PrettySubcommand) {
@@ -943,6 +1015,12 @@ int main(int argc_, const char *argv_[]) {
exit(1);
}
diff(opts::diff::InputFilenames[0], opts::diff::InputFilenames[1]);
+ } else if (opts::MergeSubcommand) {
+ if (opts::merge::InputFilenames.size() < 2) {
+ errs() << "merge subcommand requires at least 2 input files.\n";
+ exit(1);
+ }
+ mergePdbs();
}
outs().flush();
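
The merge subcommand above is a thin driver over codeview::mergeTypeStreams(), which in this patch takes the destination id (IPI) and type (TPI) tables, a SourceToDest index map, a handler pointer (passed as nullptr here), and the source type array. A hedged sketch of one merge step, mirroring mergePdbs(); the File handle is obtained via loadPDB() as above, the remaining names are illustrative:

    // Merge one PDB's TPI records into in-memory destination tables (sketch).
    BumpPtrAllocator Alloc;
    TypeTableBuilder DestIds(Alloc);     // destination IPI table
    TypeTableBuilder DestTypes(Alloc);   // destination TPI table
    SmallVector<TypeIndex, 128> SourceToDest;
    auto &Tpi = ExitOnErr(File.getPDBTpiStream());
    ExitOnErr(codeview::mergeTypeStreams(DestIds, DestTypes, SourceToDest,
                                         /*Handler=*/nullptr, Tpi.typeArray()));

From the command line this corresponds to an invocation along the lines of "llvm-pdbdump merge -pdb=merged.pdb a.pdb b.pdb"; per the code above, at least two inputs are required, and when -pdb is omitted the output name is derived from the first input by replacing its extension with "merged.pdb".
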
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index aca7de840d80..78bfa558e4a3 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -22,8 +22,9 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
@@ -34,19 +35,20 @@
#include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
#include "llvm/DebugInfo/CodeView/SymbolDumper.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/COFF.h"
-#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Path.h"
@@ -71,7 +73,7 @@ class COFFDumper : public ObjDumper {
public:
friend class COFFObjectDumpDelegate;
COFFDumper(const llvm::object::COFFObjectFile *Obj, ScopedPrinter &Writer)
- : ObjDumper(Writer), Obj(Obj), Writer(Writer), TypeDB(100) {}
+ : ObjDumper(Writer), Obj(Obj), Writer(Writer), Types(100) {}
void printFileHeaders() override;
void printSections() override;
@@ -107,7 +109,7 @@ private:
void printFileNameForOffset(StringRef Label, uint32_t FileOffset);
void printTypeIndex(StringRef FieldName, TypeIndex TI) {
// Forward to CVTypeDumper for simplicity.
- CVTypeDumper::printTypeIndex(Writer, FieldName, TI, TypeDB);
+ codeview::printTypeIndex(Writer, FieldName, TI, Types);
}
void printCodeViewSymbolsSubsection(StringRef Subsection,
@@ -155,14 +157,13 @@ private:
bool RelocCached = false;
RelocMapTy RelocMap;
- BinaryByteStream ChecksumContents;
VarStreamArray<FileChecksumEntry> CVFileChecksumTable;
- BinaryByteStream StringTableContents;
StringTableRef CVStringTable;
ScopedPrinter &Writer;
- TypeDatabase TypeDB;
+ BinaryByteStream TypeContents;
+ LazyRandomTypeCollection Types;
};
class COFFObjectDumpDelegate : public SymbolDumpDelegate {
@@ -775,14 +776,13 @@ void COFFDumper::initializeFileAndStringTables(BinaryStreamReader &Reader) {
switch (ModuleDebugFragmentKind(SubType)) {
case ModuleDebugFragmentKind::FileChecksums: {
- ChecksumContents = BinaryByteStream(Contents, support::little);
- BinaryStreamReader CSR(ChecksumContents);
+ BinaryStreamReader CSR(Contents, support::little);
error(CSR.readArray(CVFileChecksumTable, CSR.getLength()));
break;
}
case ModuleDebugFragmentKind::StringTable: {
- StringTableContents = BinaryByteStream(Contents, support::little);
- error(CVStringTable.initialize(StringTableContents));
+ BinaryStreamRef ST(Contents, support::little);
+ error(CVStringTable.initialize(ST));
} break;
default:
break;
@@ -812,8 +812,7 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
if (Magic != COFF::DEBUG_SECTION_MAGIC)
return error(object_error::parse_failed);
- BinaryByteStream FileAndStrings(Data, support::little);
- BinaryStreamReader FSReader(FileAndStrings);
+ BinaryStreamReader FSReader(Data, support::little);
initializeFileAndStringTables(FSReader);
// TODO: Convert this over to using ModuleSubstreamVisitor.
@@ -889,8 +888,7 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
}
case ModuleDebugFragmentKind::FrameData: {
// First four bytes is a relocation against the function.
- BinaryByteStream S(Contents, llvm::support::little);
- BinaryStreamReader SR(S);
+ BinaryStreamReader SR(Contents, llvm::support::little);
const uint32_t *CodePtr;
error(SR.readObject(CodePtr));
StringRef LinkageName;
@@ -934,8 +932,7 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
ListScope S(W, "FunctionLineTable");
W.printString("LinkageName", Name);
- BinaryByteStream LineTableInfo(FunctionLineTables[Name], support::little);
- BinaryStreamReader Reader(LineTableInfo);
+ BinaryStreamReader Reader(FunctionLineTables[Name], support::little);
ModuleDebugLineFragmentRef LineInfo;
error(LineInfo.initialize(Reader));
@@ -982,12 +979,9 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
Subsection.bytes_end());
auto CODD = llvm::make_unique<COFFObjectDumpDelegate>(*this, Section, Obj,
SectionContents);
-
- CVSymbolDumper CVSD(W, TypeDB, std::move(CODD),
- opts::CodeViewSubsectionBytes);
- BinaryByteStream Stream(BinaryData, llvm::support::little);
+ CVSymbolDumper CVSD(W, Types, std::move(CODD), opts::CodeViewSubsectionBytes);
CVSymbolArray Symbols;
- BinaryStreamReader Reader(Stream);
+ BinaryStreamReader Reader(BinaryData, llvm::support::little);
if (auto EC = Reader.readArray(Symbols, Reader.getLength())) {
consumeError(std::move(EC));
W.flush();
@@ -1002,8 +996,7 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
}
void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
- BinaryByteStream S(Subsection, llvm::support::little);
- BinaryStreamReader SR(S);
+ BinaryStreamReader SR(Subsection, llvm::support::little);
ModuleDebugFileChecksumFragmentRef Checksums;
error(Checksums.initialize(SR));
@@ -1021,8 +1014,7 @@ void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
}
void COFFDumper::printCodeViewInlineeLines(StringRef Subsection) {
- BinaryByteStream S(Subsection, llvm::support::little);
- BinaryStreamReader SR(S);
+ BinaryStreamReader SR(Subsection, llvm::support::little);
ModuleDebugInlineeLineFragmentRef Lines;
error(Lines.initialize(SR));
@@ -1072,18 +1064,17 @@ void COFFDumper::mergeCodeViewTypes(TypeTableBuilder &CVIDs,
error(consume(Data, Magic));
if (Magic != 4)
error(object_error::parse_failed);
- ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(Data.data()),
- Data.size());
- BinaryByteStream Stream(Bytes, llvm::support::little);
+
CVTypeArray Types;
- BinaryStreamReader Reader(Stream);
+ BinaryStreamReader Reader(Data, llvm::support::little);
if (auto EC = Reader.readArray(Types, Reader.getLength())) {
consumeError(std::move(EC));
W.flush();
error(object_error::parse_failed);
}
-
- if (auto EC = mergeTypeStreams(CVIDs, CVTypes, nullptr, Types))
+ SmallVector<TypeIndex, 128> SourceToDest;
+ if (auto EC =
+ mergeTypeStreams(CVIDs, CVTypes, SourceToDest, nullptr, Types))
return error(std::move(EC));
}
}
@@ -1105,12 +1096,11 @@ void COFFDumper::printCodeViewTypeSection(StringRef SectionName,
if (Magic != COFF::DEBUG_SECTION_MAGIC)
return error(object_error::parse_failed);
- CVTypeDumper CVTD(TypeDB);
- TypeDumpVisitor TDV(TypeDB, &W, opts::CodeViewSubsectionBytes);
- if (auto EC = CVTD.dump({Data.bytes_begin(), Data.bytes_end()}, TDV)) {
- W.flush();
- error(llvm::errorToErrorCode(std::move(EC)));
- }
+ Types.reset(Data);
+
+ TypeDumpVisitor TDV(Types, &W, opts::CodeViewSubsectionBytes);
+ error(codeview::visitTypeStream(Types, TDV));
+ W.flush();
}
void COFFDumper::printSections() {
@@ -1650,35 +1640,22 @@ void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer,
TypeBuf.append(Record.begin(), Record.end());
});
- TypeDatabase TypeDB(CVTypes.records().size());
+ TypeTableCollection TpiTypes(CVTypes.records());
{
ListScope S(Writer, "MergedTypeStream");
- CVTypeDumper CVTD(TypeDB);
- TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes);
- if (auto EC = CVTD.dump(
- {TypeBuf.str().bytes_begin(), TypeBuf.str().bytes_end()}, TDV)) {
- Writer.flush();
- error(std::move(EC));
- }
+ TypeDumpVisitor TDV(TpiTypes, &Writer, opts::CodeViewSubsectionBytes);
+ error(codeview::visitTypeStream(TpiTypes, TDV));
+ Writer.flush();
}
// Flatten the id stream and print it next. The ID stream refers to names from
// the type stream.
- SmallString<0> IDBuf;
- IDTable.ForEachRecord([&](TypeIndex TI, ArrayRef<uint8_t> Record) {
- IDBuf.append(Record.begin(), Record.end());
- });
-
+ TypeTableCollection IpiTypes(IDTable.records());
{
ListScope S(Writer, "MergedIDStream");
- TypeDatabase IDDB(IDTable.records().size());
- CVTypeDumper CVTD(IDDB);
- TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes);
- TDV.setItemDB(IDDB);
- if (auto EC = CVTD.dump(
- {IDBuf.str().bytes_begin(), IDBuf.str().bytes_end()}, TDV)) {
- Writer.flush();
- error(std::move(EC));
- }
+ TypeDumpVisitor TDV(TpiTypes, &Writer, opts::CodeViewSubsectionBytes);
+ TDV.setIpiTypes(IpiTypes);
+ error(codeview::visitTypeStream(IpiTypes, TDV));
+ Writer.flush();
}
}
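
A recurring simplification above is that BinaryStreamReader can now be constructed directly from the raw bytes plus an endianness, with no intermediate BinaryByteStream. A small before/after sketch, assuming BinaryData is the ArrayRef<uint8_t> of a CodeView symbol subsection as in printCodeViewSymbolsSubsection():

    // Before: wrap the bytes in a BinaryByteStream first.
    //   BinaryByteStream Stream(BinaryData, llvm::support::little);
    //   BinaryStreamReader Reader(Stream);
    // After: construct the reader straight from the bytes.
    BinaryStreamReader Reader(BinaryData, llvm::support::little);
    CVSymbolArray Symbols;
    if (auto EC = Reader.readArray(Symbols, Reader.getLength()))
      consumeError(std::move(EC));   // error handling here is illustrative
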
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index c362dff3a3e0..bef197d603ca 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
@@ -579,6 +580,13 @@ int main(int argc, char **argv) {
NoOutput = true;
}
+ if (TM) {
+ // FIXME: We should dyn_cast this when supported.
+ auto &LTM = static_cast<LLVMTargetMachine &>(*TM);
+ Pass *TPC = LTM.createPassConfig(Passes);
+ Passes.add(TPC);
+ }
+
// Create a new optimization pass for each one specified on the command line
for (unsigned i = 0; i < PassList.size(); ++i) {
if (StandardLinkOpts &&
@@ -619,9 +627,7 @@ int main(int argc, char **argv) {
const PassInfo *PassInf = PassList[i];
Pass *P = nullptr;
- if (PassInf->getTargetMachineCtor())
- P = PassInf->getTargetMachineCtor()(TM.get());
- else if (PassInf->getNormalCtor())
+ if (PassInf->getNormalCtor())
P = PassInf->getNormalCtor()();
else
errs() << argv[0] << ": cannot create pass: "
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index 5594955e7baf..05fad386064c 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -1002,6 +1002,64 @@ TEST(APIntTest, divrem_big7) {
{224, "80000000800000010000000f", 16});
}
+void testDiv(APInt a, uint64_t b, APInt c) {
+ auto p = a * b + c;
+
+ APInt q;
+ uint64_t r;
+ // Unsigned division will only work if our original number wasn't negative.
+ if (!a.isNegative()) {
+ q = p.udiv(b);
+ r = p.urem(b);
+ EXPECT_EQ(a, q);
+ EXPECT_EQ(c, r);
+ APInt::udivrem(p, b, q, r);
+ EXPECT_EQ(a, q);
+ EXPECT_EQ(c, r);
+ }
+ q = p.sdiv(b);
+ r = p.srem(b);
+ EXPECT_EQ(a, q);
+ if (c.isNegative())
+ EXPECT_EQ(-c, -r); // Need to negate so the uint64_t compare will work.
+ else
+ EXPECT_EQ(c, r);
+ int64_t sr;
+ APInt::sdivrem(p, b, q, sr);
+ EXPECT_EQ(a, q);
+ if (c.isNegative())
+ EXPECT_EQ(-c, -sr); // Need to negate so the uint64_t compare will work.
+ else
+ EXPECT_EQ(c, sr);
+}
+
+TEST(APIntTest, divremuint) {
+ // Single word APInt
+ testDiv(APInt{64, 9},
+ 2,
+ APInt{64, 1});
+
+ // Single word negative APInt
+ testDiv(-APInt{64, 9},
+ 2,
+ -APInt{64, 1});
+
+ // Multiword dividend with only one significant word.
+ testDiv(APInt{256, 9},
+ 2,
+ APInt{256, 1});
+
+ // Negative dividend.
+ testDiv(-APInt{256, 9},
+ 2,
+ -APInt{256, 1});
+
+ // Multiword dividend
+ testDiv(APInt{1024, 19}.shl(811),
+ 4356013, // one word
+ APInt{1024, 1});
+}
+
TEST(APIntTest, fromString) {
EXPECT_EQ(APInt(32, 0), APInt(32, "0", 2));
EXPECT_EQ(APInt(32, 1), APInt(32, "1", 2));
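
The testDiv() helper above exercises the APInt division overloads that accept a plain uint64_t divisor, plus the matching udivrem/sdivrem forms whose remainder comes back as a native integer. A minimal sketch of those calls with illustrative values:

    APInt P(256, 1000);       // wide dividend
    uint64_t B = 7;           // one-word divisor

    APInt Q = P.udiv(B);      // quotient stays an APInt
    uint64_t R = P.urem(B);   // remainder fits in a uint64_t

    APInt Q2;
    uint64_t R2;
    APInt::udivrem(P, B, Q2, R2);   // quotient and remainder in one call

    APInt SQ;
    int64_t SR;
    APInt::sdivrem(P, B, SQ, SR);   // signed variant, int64_t remainder

The unsigned forms are only meaningful for a non-negative dividend, which is why the test guards them with !a.isNegative().
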
diff --git a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
index 9ff37e93b151..0ca24c716d1d 100644
--- a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
+++ b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
@@ -11,14 +11,12 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
#include "llvm/Support/Allocator.h"
@@ -130,20 +128,16 @@ public:
void SetUp() override {
TestState = llvm::make_unique<PerTestState>();
-
- TestState->Pipeline.addCallbackToPipeline(TestState->Deserializer);
- TestState->Pipeline.addCallbackToPipeline(TestState->Callbacks);
}
void TearDown() override { TestState.reset(); }
protected:
- bool ValidateDatabaseRecord(const RandomAccessTypeVisitor &Visitor,
- uint32_t Index) {
+ bool ValidateDatabaseRecord(LazyRandomTypeCollection &Types, uint32_t Index) {
TypeIndex TI = TypeIndex::fromArrayIndex(Index);
- if (!Visitor.database().contains(TI))
+ if (!Types.contains(TI))
return false;
- if (GlobalState->TypeVector[Index] != Visitor.database().getTypeRecord(TI))
+ if (GlobalState->TypeVector[Index] != Types.getType(TI))
return false;
return true;
}
@@ -184,8 +178,6 @@ protected:
struct PerTestState {
FixedStreamArray<TypeIndexOffset> Offsets;
- TypeVisitorCallbackPipeline Pipeline;
- TypeDeserializer Deserializer;
MockCallbacks Callbacks;
};
@@ -218,21 +210,22 @@ std::unique_ptr<RandomAccessVisitorTest::GlobalTestState>
TEST_F(RandomAccessVisitorTest, MultipleVisits) {
TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 8});
- RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
- GlobalState->TypeVector.size(),
- TestState->Offsets);
+ LazyRandomTypeCollection Types(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
std::vector<uint32_t> IndicesToVisit = {5, 5, 5};
for (uint32_t I : IndicesToVisit) {
TypeIndex TI = TypeIndex::fromArrayIndex(I);
- EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ CVType T = Types.getType(TI);
+ EXPECT_NO_ERROR(codeview::visitTypeRecord(T, TI, TestState->Callbacks));
}
// [0,8) should be present
- EXPECT_EQ(8u, Visitor.database().size());
+ EXPECT_EQ(8u, Types.size());
for (uint32_t I = 0; I < 8; ++I)
- EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+ EXPECT_TRUE(ValidateDatabaseRecord(Types, I));
// 5, 5, 5
EXPECT_EQ(3u, TestState->Callbacks.count());
@@ -248,19 +241,19 @@ TEST_F(RandomAccessVisitorTest, DescendingWithinChunk) {
std::vector<uint32_t> IndicesToVisit = {7, 4, 2};
- RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
- GlobalState->TypeVector.size(),
- TestState->Offsets);
-
+ LazyRandomTypeCollection Types(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
for (uint32_t I : IndicesToVisit) {
TypeIndex TI = TypeIndex::fromArrayIndex(I);
- EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ CVType T = Types.getType(TI);
+ EXPECT_NO_ERROR(codeview::visitTypeRecord(T, TI, TestState->Callbacks));
}
// [0, 7]
- EXPECT_EQ(8u, Visitor.database().size());
+ EXPECT_EQ(8u, Types.size());
for (uint32_t I = 0; I < 8; ++I)
- EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+ EXPECT_TRUE(ValidateDatabaseRecord(Types, I));
// 2, 4, 7
EXPECT_EQ(3u, TestState->Callbacks.count());
@@ -276,19 +269,19 @@ TEST_F(RandomAccessVisitorTest, AscendingWithinChunk) {
std::vector<uint32_t> IndicesToVisit = {2, 4, 7};
- RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
- GlobalState->TypeVector.size(),
- TestState->Offsets);
-
+ LazyRandomTypeCollection Types(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
for (uint32_t I : IndicesToVisit) {
TypeIndex TI = TypeIndex::fromArrayIndex(I);
- EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ CVType T = Types.getType(TI);
+ EXPECT_NO_ERROR(codeview::visitTypeRecord(T, TI, TestState->Callbacks));
}
// [0, 7]
- EXPECT_EQ(8u, Visitor.database().size());
+ EXPECT_EQ(8u, Types.size());
for (uint32_t I = 0; I < 8; ++I)
- EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+ EXPECT_TRUE(ValidateDatabaseRecord(Types, I));
// 2, 4, 7
EXPECT_EQ(3u, TestState->Callbacks.count());
@@ -305,19 +298,20 @@ TEST_F(RandomAccessVisitorTest, StopPrematurelyInChunk) {
std::vector<uint32_t> IndicesToVisit = {0, 1, 2};
- RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
- GlobalState->TypeVector.size(),
- TestState->Offsets);
+ LazyRandomTypeCollection Types(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
for (uint32_t I : IndicesToVisit) {
TypeIndex TI = TypeIndex::fromArrayIndex(I);
- EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ CVType T = Types.getType(TI);
+ EXPECT_NO_ERROR(codeview::visitTypeRecord(T, TI, TestState->Callbacks));
}
// [0, 8) should be visited.
- EXPECT_EQ(8u, Visitor.database().size());
+ EXPECT_EQ(8u, Types.size());
for (uint32_t I = 0; I < 8; ++I)
- EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+ EXPECT_TRUE(ValidateDatabaseRecord(Types, I));
// [0, 2]
EXPECT_EQ(3u, TestState->Callbacks.count());
@@ -333,19 +327,20 @@ TEST_F(RandomAccessVisitorTest, InnerChunk) {
std::vector<uint32_t> IndicesToVisit = {5, 7};
- RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
- GlobalState->TypeVector.size(),
- TestState->Offsets);
+ LazyRandomTypeCollection Types(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
for (uint32_t I : IndicesToVisit) {
TypeIndex TI = TypeIndex::fromArrayIndex(I);
- EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ CVType T = Types.getType(TI);
+ EXPECT_NO_ERROR(codeview::visitTypeRecord(T, TI, TestState->Callbacks));
}
// [4, 9)
- EXPECT_EQ(5u, Visitor.database().size());
+ EXPECT_EQ(5u, Types.size());
for (uint32_t I = 4; I < 9; ++I)
- EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+ EXPECT_TRUE(ValidateDatabaseRecord(Types, I));
// 5, 7
EXPECT_EQ(2u, TestState->Callbacks.count());
diff --git a/unittests/IR/AttributesTest.cpp b/unittests/IR/AttributesTest.cpp
index 0df7a847f8a5..7af4aebd540a 100644
--- a/unittests/IR/AttributesTest.cpp
+++ b/unittests/IR/AttributesTest.cpp
@@ -63,4 +63,23 @@ TEST(Attributes, AddAttributes) {
EXPECT_TRUE(AL.hasFnAttribute(Attribute::NoReturn));
}
+TEST(Attributes, AddMatchingAlignAttr) {
+ LLVMContext C;
+ AttributeList AL;
+ AL = AL.addAttribute(C, AttributeList::FirstArgIndex,
+ Attribute::getWithAlignment(C, 8));
+ AL = AL.addAttribute(C, AttributeList::FirstArgIndex + 1,
+ Attribute::getWithAlignment(C, 32));
+ EXPECT_EQ(8U, AL.getParamAlignment(0));
+ EXPECT_EQ(32U, AL.getParamAlignment(1));
+
+ AttrBuilder B;
+ B.addAttribute(Attribute::NonNull);
+ B.addAlignmentAttr(8);
+ AL = AL.addAttributes(C, AttributeList::FirstArgIndex, B);
+ EXPECT_EQ(8U, AL.getParamAlignment(0));
+ EXPECT_EQ(32U, AL.getParamAlignment(1));
+ EXPECT_TRUE(AL.hasParamAttribute(0, Attribute::NonNull));
+}
+
} // end anonymous namespace
diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp
index 6959ac85e49d..7a8a3045a0d4 100644
--- a/unittests/IR/ConstantsTest.cpp
+++ b/unittests/IR/ConstantsTest.cpp
@@ -178,14 +178,15 @@ TEST(ConstantsTest, PointerCast) {
ConstantExpr::getAddrSpaceCast(NullInt32Ptr1, Int32PtrTy));
}
-#define CHECK(x, y) { \
- std::string __s; \
- raw_string_ostream __o(__s); \
- Instruction *__I = cast<ConstantExpr>(x)->getAsInstruction(); \
- __I->print(__o); \
- delete __I; \
- __o.flush(); \
- EXPECT_EQ(std::string(" <badref> = " y), __s); \
+#define CHECK(x, y) \
+ { \
+ std::string __s; \
+ raw_string_ostream __o(__s); \
+ Instruction *__I = cast<ConstantExpr>(x)->getAsInstruction(); \
+ __I->print(__o); \
+ __I->deleteValue(); \
+ __o.flush(); \
+ EXPECT_EQ(std::string(" <badref> = " y), __s); \
}
TEST(ConstantsTest, AsInstructionsTest) {
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index ae9c2684212b..498e111a31f6 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -226,7 +226,7 @@ namespace llvm {
char DPass::ID = 0;
std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context, DPass *P) {
- const char *ModuleStrig =
+ const char *ModuleString =
"declare i32 @g()\n" \
"define void @f(i32 %x) personality i32 ()* @g {\n" \
"bb0:\n" \
@@ -250,7 +250,7 @@ namespace llvm {
" ret void\n" \
"}\n";
SMDiagnostic Err;
- return parseAssemblyString(ModuleStrig, Err, Context);
+ return parseAssemblyString(ModuleString, Err, Context);
}
TEST(DominatorTree, Unreachable) {
diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp
index b8d398c5cc38..619ddc5413df 100644
--- a/unittests/IR/InstructionsTest.cpp
+++ b/unittests/IR/InstructionsTest.cpp
@@ -406,8 +406,8 @@ TEST(InstructionsTest, FPMathOperator) {
EXPECT_TRUE(isa<FPMathOperator>(V1));
FPMathOperator *O1 = cast<FPMathOperator>(V1);
EXPECT_EQ(O1->getFPAccuracy(), 1.0);
- delete V1;
- delete I;
+ V1->deleteValue();
+ I->deleteValue();
}
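
This and the neighboring unittest changes consistently replace "delete V" on IR values with V->deleteValue(), the idiom this patch set uses to destroy a value that has no parent to own it. A sketch of the pattern; the created instruction is illustrative:

    LLVMContext Ctx;
    Type *I32 = Type::getInt32Ty(Ctx);
    Value *LHS = ConstantInt::get(I32, 1);
    Value *RHS = ConstantInt::get(I32, 2);
    // A free-standing instruction, never inserted into a basic block.
    Instruction *Add = BinaryOperator::CreateAdd(LHS, RHS);

    // Old style, removed throughout this patch:
    //   delete Add;
    // New style:
    Add->deleteValue();
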
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index e61e649df191..9f8fc4eaeb6f 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -218,7 +218,7 @@ TEST_F(MDNodeTest, Delete) {
EXPECT_EQ(n, wvh);
- delete I;
+ I->deleteValue();
}
TEST_F(MDNodeTest, SelfReference) {
diff --git a/unittests/Support/BinaryStreamTest.cpp b/unittests/Support/BinaryStreamTest.cpp
index ec3b0effc9e9..1ce74cbb722b 100644
--- a/unittests/Support/BinaryStreamTest.cpp
+++ b/unittests/Support/BinaryStreamTest.cpp
@@ -289,6 +289,39 @@ TEST_F(BinaryStreamTest, StreamRefBounds) {
}
}
+TEST_F(BinaryStreamTest, DropOperations) {
+ std::vector<uint8_t> InputData = {1, 2, 3, 4, 5, 4, 3, 2, 1};
+ auto RefData = makeArrayRef(InputData);
+ initializeInput(InputData, 1);
+
+ ArrayRef<uint8_t> Result;
+ BinaryStreamRef Original(InputData, support::little);
+ ASSERT_EQ(InputData.size(), Original.getLength());
+
+ EXPECT_NO_ERROR(Original.readBytes(0, InputData.size(), Result));
+ EXPECT_EQ(RefData, Result);
+
+ auto Dropped = Original.drop_front(2);
+ EXPECT_NO_ERROR(Dropped.readBytes(0, Dropped.getLength(), Result));
+ EXPECT_EQ(RefData.drop_front(2), Result);
+
+ Dropped = Original.drop_back(2);
+ EXPECT_NO_ERROR(Dropped.readBytes(0, Dropped.getLength(), Result));
+ EXPECT_EQ(RefData.drop_back(2), Result);
+
+ Dropped = Original.keep_front(2);
+ EXPECT_NO_ERROR(Dropped.readBytes(0, Dropped.getLength(), Result));
+ EXPECT_EQ(RefData.take_front(2), Result);
+
+ Dropped = Original.keep_back(2);
+ EXPECT_NO_ERROR(Dropped.readBytes(0, Dropped.getLength(), Result));
+ EXPECT_EQ(RefData.take_back(2), Result);
+
+ Dropped = Original.drop_symmetric(2);
+ EXPECT_NO_ERROR(Dropped.readBytes(0, Dropped.getLength(), Result));
+ EXPECT_EQ(RefData.drop_front(2).drop_back(2), Result);
+}
+
// Test that we can write to a BinaryStream without a StreamWriter.
TEST_F(BinaryStreamTest, MutableBinaryByteStreamBounds) {
std::vector<uint8_t> InputData = {'T', 'e', 's', 't', '\0'};
diff --git a/unittests/Support/CrashRecoveryTest.cpp b/unittests/Support/CrashRecoveryTest.cpp
index dbb0db576793..33d87a1c0e4a 100644
--- a/unittests/Support/CrashRecoveryTest.cpp
+++ b/unittests/Support/CrashRecoveryTest.cpp
@@ -17,11 +17,15 @@
#include <windows.h>
#endif
+extern "C" const char *__asan_default_options() {
+ return "allow_user_segv_handler=1";
+}
+
using namespace llvm;
using namespace llvm::sys;
static int GlobalInt = 0;
-static void nullDeref() { *(volatile int *)nullptr = 0; }
+static void nullDeref() { *(volatile int *)0x10 = 0; }
static void incrementGlobal() { ++GlobalInt; }
static void llvmTrap() { LLVM_BUILTIN_TRAP; }
diff --git a/unittests/Support/ScaledNumberTest.cpp b/unittests/Support/ScaledNumberTest.cpp
index 2f38b2a40fb8..9e3f6de6bd17 100644
--- a/unittests/Support/ScaledNumberTest.cpp
+++ b/unittests/Support/ScaledNumberTest.cpp
@@ -335,10 +335,12 @@ TEST(ScaledNumberHelpersTest, matchScales) {
EXPECT_EQ(SOut, matchScales(LDx, LSx, RDx, RSx)); \
EXPECT_EQ(LDy, LDx); \
EXPECT_EQ(RDy, RDx); \
- if (LDy) \
+ if (LDy) { \
EXPECT_EQ(Sy, LSx); \
- if (RDy) \
+ } \
+ if (RDy) { \
EXPECT_EQ(Sy, RSx); \
+ } \
} while (false)
MATCH_SCALES(uint32_t, 0, 0, 0, 0, 0, 0, 0);
diff --git a/unittests/Target/AArch64/InstSizes.cpp b/unittests/Target/AArch64/InstSizes.cpp
index 5adbd7d2de43..22b47c6852ab 100644
--- a/unittests/Target/AArch64/InstSizes.cpp
+++ b/unittests/Target/AArch64/InstSizes.cpp
@@ -30,8 +30,7 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
std::unique_ptr<AArch64InstrInfo> createInstrInfo(TargetMachine *TM) {
AArch64Subtarget ST(TM->getTargetTriple(), TM->getTargetCPU(),
- TM->getTargetFeatureString(), *TM, /* isLittle */ false,
- /* ForCodeSize */ false);
+ TM->getTargetFeatureString(), *TM, /* isLittle */ false);
return llvm::make_unique<AArch64InstrInfo>(ST);
}
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index 83f146dca704..d13547a842e4 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -41,13 +41,18 @@ protected:
}
void eraseClones() {
- DeleteContainerPointers(Clones);
+ for (Value *V : Clones)
+ V->deleteValue();
+ Clones.clear();
}
void TearDown() override {
eraseClones();
- DeleteContainerPointers(Orig);
- delete V;
+ for (Value *V : Orig)
+ V->deleteValue();
+ Orig.clear();
+ if (V)
+ V->deleteValue();
}
SmallPtrSet<Value *, 4> Orig; // Erase on exit
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index d1014a5668a5..eb277f3298f9 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -75,6 +75,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::x86mmx: return "MVT::x86mmx";
case MVT::Glue: return "MVT::Glue";
case MVT::isVoid: return "MVT::isVoid";
+ case MVT::v1i1: return "MVT::v1i1";
case MVT::v2i1: return "MVT::v2i1";
case MVT::v4i1: return "MVT::v4i1";
case MVT::v8i1: return "MVT::v8i1";
@@ -126,6 +127,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::v2f64: return "MVT::v2f64";
case MVT::v4f64: return "MVT::v4f64";
case MVT::v8f64: return "MVT::v8f64";
+ case MVT::nxv1i1: return "MVT::nxv1i1";
case MVT::nxv2i1: return "MVT::nxv2i1";
case MVT::nxv4i1: return "MVT::nxv4i1";
case MVT::nxv8i1: return "MVT::nxv8i1";
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index dc022fe1ceb2..7a500eaf4111 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -1325,27 +1325,8 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
// Match the used operands (i.e. the children of the operator).
for (unsigned i = 0, e = Src->getNumChildren(); i != e; ++i) {
- TreePatternNode *SrcChild = Src->getChild(i);
-
- // For G_INTRINSIC, the operand immediately following the defs is an
- // intrinsic ID.
- if (SrcGI.TheDef->getName() == "G_INTRINSIC" && i == 0) {
- if (!SrcChild->isLeaf())
- return failedImport("Expected IntInit containing intrinsic ID");
-
- if (IntInit *SrcChildIntInit =
- dyn_cast<IntInit>(SrcChild->getLeafValue())) {
- OperandMatcher &OM =
- InsnMatcher.addOperand(OpIdx++, SrcChild->getName(), TempOpIdx);
- OM.addPredicate<IntOperandMatcher>(SrcChildIntInit->getValue());
- continue;
- }
-
- return failedImport("Expected IntInit containing instrinsic ID)");
- }
-
- if (auto Error =
- importChildMatcher(InsnMatcher, SrcChild, OpIdx++, TempOpIdx))
+ if (auto Error = importChildMatcher(InsnMatcher, Src->getChild(i), OpIdx++,
+ TempOpIdx))
return std::move(Error);
}
@@ -1380,7 +1361,7 @@ Error GlobalISelEmitter::importChildMatcher(InstructionMatcher &InsnMatcher,
auto OpTyOrNone = MVTToLLT(ChildTypes.front().getConcrete());
if (!OpTyOrNone)
- return failedImport("Src operand has an unsupported type (" + to_string(*SrcChild) + ")");
+ return failedImport("Src operand has an unsupported type");
OM.addPredicate<LLTOperandMatcher>(*OpTyOrNone);
// Check for nested instructions.
diff --git a/utils/git-svn/git-llvm b/utils/git-svn/git-llvm
index 55d3129c4a82..9309889f30df 100755
--- a/utils/git-svn/git-llvm
+++ b/utils/git-svn/git-llvm
@@ -202,6 +202,8 @@ def fix_eol_style_native(rev, sr, svn_sr_path):
files = git('diff-tree', '--no-commit-id', '--name-only', '-r', rev, '--',
sr).split('\n')
files = [f.split('/', 1)[1] for f in files]
+ # Skip files that don't exist in SVN yet.
+ files = [f for f in files if os.path.exists(os.path.join(svn_sr_path, f))]
# Use ignore_errors because 'svn propget' prints errors if the file doesn't
# have the named property. There doesn't seem to be a way to suppress that.
eol_props = svn(svn_sr_path, 'propget', 'svn:eol-style', *files,
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 3fb9def26ee8..98563db7ba59 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -506,13 +506,9 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
cmd.commands[i], out, err, res, timeoutHelper.timeoutReached(),
output_files))
if cmd.pipe_err:
- # Python treats the exit code as a signed char.
- if exitCode is None:
+ # Take the last failing exit code from the pipeline.
+ if not exitCode or res != 0:
exitCode = res
- elif res < 0:
- exitCode = min(exitCode, res)
- else:
- exitCode = max(exitCode, res)
else:
exitCode = res